Context Navigation

MongoDBAccess.java@ 33906

Last change on this file since 33906 was 33906, checked in by ak19, 4 years ago

Code is in an intermediate state. 1. Introduced a basicDomain field to MongoDB and recreated the MongoDB tables/collections; this will help discount duplicated domains under http and https, with and without www. Though webpage URLs may potentially still be unique and not duplicated across all 4 possible variants, I want them counted under the same base domain name. 2. Another issue noticed now is that some of the sites appear to be hosted on servers in multiple countries, and so slightly different country code counts and domain listings are returned. 3. So added code modifications (untested) to sort the domains alphabetically after stripping protocol and www, to allow comparing the old domain listing results of MongoDB's now renamed oldWebsites and oldWebpages collections to the new versions of these collections, and to then update the differences in manual counts.

File size: 27.7 KB

Line
1	package org.greenstone.atea;
2
3	//import org.bson.BSONObject;
4
5	import com.mongodb.client.AggregateIterable;
6	import com.mongodb.client.MongoCollection;
7	import com.mongodb.client.MongoDatabase;
8	//import com.mongodb.client.MongoIterable;
9
10	// to use collection.find() filters like eq(), regex() etc
11	import static com.mongodb.client.model.Filters.*;
12	// to use collection.find().projection() filters like include() etc
13	import static com.mongodb.client.model.Projections.*;
14	// to use aggregation functions like unwind(), match(), sort() etc
15	import static com.mongodb.client.model.Aggregates.*;
16	// to use functions like sum() and addToSet() within aggregation functions
17	import static com.mongodb.client.model.Accumulators.*;
18
19	//import org.bson.conversions.Bson;
20	import com.mongodb.BasicDBObject;
21	import com.mongodb.MongoClient;
22	import com.mongodb.MongoCredential;
23	import com.mongodb.ServerAddress;
24	import com.mongodb.MongoClientOptions;
25
26	import com.mongodb.Block;
27
28	import org.bson.BsonArray;
29	import org.bson.BsonString;
30	import org.bson.Document;
31	import org.bson.conversions.Bson;
32	import org.bson.json.JsonMode;
33	import org.bson.json.JsonWriterSettings;
34
35	import com.mongodb.util.JSON;
36	//import com.mongodb.DBObject;
37
38
39	import com.google.gson.*; // for pretty printing
40
41	import java.io.BufferedReader;
42	import java.io.BufferedWriter;
43	import java.io.File;
44	import java.io.FileReader;
45	import java.io.FileWriter;
46	import java.io.IOException;
47	import java.io.UncheckedIOException;
48	import java.io.Writer;
49
50	import java.util.Arrays;
51	import java.util.ArrayList;
52	import java.util.List;
53	import java.util.Properties;
54	import java.util.regex.Pattern;
55
56	import org.apache.log4j.Logger;
57
58	import org.greenstone.atea.morphia.*;
59	import dev.morphia.*;
60
61	import org.apache.commons.csv.*;
62
63	/**
64	* https://www.tutorialspoint.com/mongodb/mongodb_java.htm
65	*
66	* TO COMPILE:
67	* maori-lang-detection/src$
68	* javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
69	*
70	* TO RUN:
71	* java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
72	*
73	* Manually connecting to mongodb from client:
74	* mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
75	* Then after connecting with pwd, type:
76	* use DBNAME
77	*
78	* Or connect to mongodb and specify db in one statement:
79	* mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
80	*
81	* Some links:
82	* - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
83	* - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
84	* - https://tecadmin.net/tutorial/mongodb/drop-collection/
85	* IMPORTANT LINK:
86	* - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
87	*
88	* API:
89	* - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
90	* - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
91	*/
92	public class MongoDBAccess implements AutoCloseable {
93
94	private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
95
96	static final String PROPS_FILENAME = "config.properties";
97	public static final String WEBPAGES_COLLECTION = "Webpages";
98	public static final String WEBSITES_COLLECTION = "Websites";
99
100	public static final String NEWLINE = System.getProperty("line.separator");
101
102	/** mongodb filter types to execute */
103	public static final int IS_MRI = 0;
104	public static final int CONTAINS_MRI = 1;
105
106	/** Some reused fieldnames in the Websites collection */
107	private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
108	private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
109
110	// configuration details, some with fallback values
111	private String HOST = "localhost";
112	private int PORT = 27017; // mongodb port
113	private String USERNAME;
114	private String PASSWORD;
115	private String DB_NAME ="ateacrawldata";
116
117	private MongoClient mongo = null;
118	private MongoDatabase database = null;
119
120	/**
121	* Mongodb Client handle via morphia, which handles the ODM (object document mapper)
122	* for MongoDB
123	*/
124	public Datastore datastore = null;
125
/**
 * Reads the MongoDB connection settings from {@link #PROPS_FILENAME} on the classpath.
 * No connection is made here; call {@link #connectToDB()} afterwards.
 *
 * Recognised properties: mongodb.user (falls back to "root"), mongodb.pwd,
 * mongodb.host, mongodb.port and mongodb.dbname (the last three fall back to the
 * field defaults). If the properties file is missing or unreadable, the problem is
 * logged and the fallback values are used.
 *
 * @throws Exception if mongodb.pwd still has the placeholder value "CHANGEME"
 *         (a NumberFormatException also propagates if mongodb.port is not an integer)
 */
public MongoDBAccess() throws Exception {
    Properties props = new Properties();

    // getResourceAsStream() returns null when the file is not on the classpath.
    // try-with-resources accepts a null resource and closes a non-null one for us.
    try (java.io.InputStream propsStream =
             getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME)) {
        if (propsStream == null) {
            logger.error("Could not find " + PROPS_FILENAME
                         + " on the classpath. Using fallback connection settings.");
        } else {
            props.load(propsStream);
        }
    } catch (IOException | IllegalArgumentException e) {
        // IllegalArgumentException: malformed unicode escape in the properties file
        logger.error("Could not read " + PROPS_FILENAME, e);
    }

    USERNAME = props.getProperty("mongodb.user", "");
    if (USERNAME.equals("")) {
        USERNAME = "root";
        logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
    }

    PASSWORD = props.getProperty("mongodb.pwd");
    // Never write the password itself to the log, only whether one was supplied.
    logger.debug("mongodb.pwd " + (PASSWORD == null ? "is not set" : "was read") + " in " + PROPS_FILENAME);

    if ("CHANGEME".equals(PASSWORD)) {
        throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
    }

    HOST = props.getProperty("mongodb.host", HOST);
    PORT = Integer.parseInt(props.getProperty("mongodb.port", Integer.toString(PORT)));
    DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);

    logger.info("Connecting to mongodb with:");
    logger.info(" - host: " + HOST);
    logger.info(" - port: " + PORT);
    logger.info(" - user: " + USERNAME);
    logger.info(" - db name: " + DB_NAME);
}
167
168	/**
169	* Since we have only a single MongoClient, don't need to call close/disconnect on it as per
170	* https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
171	*/
172	public void connectToDB() throws Exception {
173
174	// Creating a Mongo client
175	mongo = new MongoClient( HOST, PORT );
176
177	// Creating Credentials
178	MongoCredential credential;
179	credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
180	System.out.println("Connected to the database successfully");
181
182	// Accessing the database
183	this.database = mongo.getDatabase(DB_NAME);
184	logger.info("Credentials: "+ credential);
185
186	/*
187	MongoCredential credential;
188	credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
189	logger.info("Credentials: "+ credential);
190
191	// Create our Mongo client
192	mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
193	System.out.println("Connected to the database successfully");
194
195	this.database = mongo.getDatabase(DB_NAME);
196	*/
197
198	Morphia morphia = new Morphia();
199	morphia.mapPackage("com.greenstone.atea.morphia");
200	datastore = morphia.createDatastore(mongo, DB_NAME);
201	datastore.ensureIndexes();
202
203	}
204
205	// TODO: which fields should be indexed?
206
207	public void showCollections() {
208	//MongoIterable<String> colls = this.database.listCollectionNames();
209	for(String coll : this.database.listCollectionNames()) {
210	System.err.println("coll: " + coll);
211	}
212	}
213
214	/*
215	public void insertWebsiteInfo(WebsiteInfo website)
216	{
217	MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
218	Document document = new Document("_id", website.id)
219	.append("siteFolderName", website.siteFolderName)
220	.append("domain", website.domain)
221	.append("basicDomain", website.basicDomain)
222	.append("totalPages", website.totalPages)
223	.append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
224	.append("numPagesInMRI", website.numPagesInMRI)
225	.append("siteCrawledTimestamp", website.siteCrawledTimestamp)
226	.append("siteCrawlUnfinished", website.siteCrawlUnfinished)
227	.append("redoCrawl", website.redoCrawl);
228
229	document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
230	if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
231	document.put("countryCode", website.geoLocationCountryCode);
232	}
233
234	collection.insertOne(document);
235	logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
236	+ " inserted successfully into " + WEBSITES_COLLECTION);
237	}
238	*/
239
240	/**
241	* Inserts a web page into the mongodb. Besides page related metadata and full body text
242	* the language information per sentence and per 2 adjacent sentences also get stored
243	* into the mongodb.
244	*/
245	/*
246	public void insertWebpageInfo(WebpageInfo webpage)
247	{
248	int mri_sentence_count = 0;
249
250	// load the webpages db 'table'
251	// in mongodb, the equivalent of db tables are called 'collections'
252	MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
253
254	Document document = new Document("_id", webpage.webpageID)
255	.append("siteid", webpage.websiteID)
256	.append("url", webpage.URL)
257	.append("isMRI", webpage.isMRI)
258	.append("totalSentences", webpage.totalSentences)
259	.append("charEncoding", webpage.charEncoding)
260	.append("modTime", webpage.modifiedTime)
261	.append("fetchTime", webpage.fetchTime);
262
263	// INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
264	// https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
265	List<BasicDBObject> sentencesList = new ArrayList<>();
266	for(SentenceInfo sentenceInfo : webpage.singleSentences) {
267
268	BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
269
270	bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
271	bsonRecord.put("sentence", sentenceInfo.sentence);
272
273	sentencesList.add(bsonRecord);
274
275	if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
276	mri_sentence_count++;
277	}
278
279	}
280	document.put("singleSentences", sentencesList);
281
282	List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
283	for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
284
285	BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
286	bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
287	bsonRecord.put("sentence", sentenceInfo.sentence);
288
289	overlappingSentencesList.add(bsonRecord);
290	}
291	document.put("overlappingSentences", overlappingSentencesList);
292
293	// also put the full text in there
294	document.put("text", webpage.text);
295
296	// also store the count of sentences in MRI
297	webpage.setMRISentenceCount(mri_sentence_count);
298	document.put("mriSentenceCount", mri_sentence_count);
299
300
301	collection.insertOne(document);
302	logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
303	}
304	*/
305
306	public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
307	return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
308	}
309	public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
310	return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
311	}
312
313	/**
314	* Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
315	* Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
316	* Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
317	* mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
318	*
319	* Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
320	* Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
321	* https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
322	* http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
323	*/
324	public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
325
326	final ArrayList<String> urlsList = new ArrayList<String>();
327
328	// remove any http(s)://(www.) from the start of URL first
329	// since it goes into a regex
330	domain = Utility.stripProtocolAndWWWFromURL(domain);
331
332	// load the "webpages" db table
333	// in mongodb, the equivalent of db tables are called 'collections'
334	MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
335
336	// code we'll execute in Iterable.forEach() below
337	// see also https://www.baeldung.com/foreach-java
338	Block<Document> storeURL = new Block<Document>() {
339	@Override
340	public void apply(final Document document) {
341	//System.out.println(document.toJson());
342	String url = document.getString("URL");
343	// add to our urlsList
344	//System.out.println(url);
345	urlsList.add(url);
346	}
347	};
348
349
350	// Run the following mongodb query:
351	// db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
352
353	// 1. One way that works:
354	//collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
355
356	// 2. Another way:
357	//String query = "{URL: /DOMAIN/, isMRI: true}";
358	String query = "{URL: /DOMAIN/, ";
359	if(filterType == IS_MRI) {
360	query += "isMRI: true}";
361	} else if(filterType == CONTAINS_MRI) {
362	query += "containsMRI: true}";
363	}
364
365	domain = domain.replace(".", "\\."); // escape dots in domain for regex
366	query = query.replace("DOMAIN", domain);
367
368	//System.err.println("Executing find query: " + query);
369
370	BasicDBObject findObj = BasicDBObject.parse(query);
371	BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
372
373
374	collection.find(findObj).projection(projectionObj).forEach(storeURL);
375
376	return urlsList;
377	}
378
379	/**
380	* RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
381	*
382	* https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
383	* https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
384	* Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
385	*
386	* (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
387	* https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
388	* On using group(TExpression) inside collection.aggregate().
389	*
390	* For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
391	* and https://www.javatpoint.com/java-8-foreach
392	* and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
393	*
394	* Count by country code of non-NZ websites containing a positive number of sentences in MRI,
395	* listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
396	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
397	* matching sites.
398	*
399	* The mongodb aggregate() we want to run this time:
400	*
401	db.Websites.aggregate([
402	{
403	$match: {
404	$and: [
405	{numPagesContainingMRI: {$gt: 0}},
406	{$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
407	]
408	}
409	},
410	{ $unwind: "$geoLocationCountryCode" },
411	{
412	$group: {
413	_id: "nz",
414	count: { $sum: 1 },
415	domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
416	}
417	},
418	{ $sort : { count : -1} }
419	]);
420	*/
421	public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
422	// working with the WebSites collection, not WebPages collection!
423	MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
424
425	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
426
427	Bson orQuery = or(
428	BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
429	BasicDBObject.parse("{domain: /\\.nz/}")
430	);
431	Bson andQuery = and(
432	BasicDBObject.parse(mriFilterString),
433	orQuery);
434
435	// Hopefully the lambda expression (forEach()) at end means
436	// we write out each result Document as we get it
437	collection.aggregate(Arrays.asList(
438	match(andQuery),
439	unwind("$geoLocationCountryCode"),
440	group("NZ", Arrays.asList(sum("count", 1),
441	addToSet("domain", "$basicDomain"))),
442	sort(BasicDBObject.parse("{count : -1}"))
443	)).forEach((Block<Document>)doc -> writeDoc(doc, writer));
444
445	// should only have one doc for NZ since it's a count by geolocation.
446
447	return;
448	}
449
450	/**
451	* Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,
452	* listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
453	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
454	* matching sites.
455	*
456	* The aggregate() we want to run this time:
457	*
458	db.Websites.aggregate([
459	{
460	$match: {
461	$and: [
462	{geoLocationCountryCode: {$ne: "NZ"}},
463	{domain: {$not: /\.nz/}},
464	{numPagesContainingMRI: {$gt: 0}},
465	{$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
466	]
467	}
468	},
469	{ $unwind: "$geoLocationCountryCode" },
470	{
471	$group: {
472	_id: {$toLower: '$geoLocationCountryCode'},
473	count: { $sum: 1 },
474	domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
475	}
476	},
477	{ $sort : { count : -1} }
478	]);
479	*/
480	public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
481	boolean isMiInURLPath) throws UncheckedIOException
482	{
483	// working with the WebSites collection, not WebPages collection!
484	MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
485
486	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
487
488	Bson orQuery = or(
489	BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
490	BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
491	// e.g. "{urlContainsLangCodeInPath: false}"
492	);
493	Bson andQuery = and(
494	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
495	BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
496	BasicDBObject.parse(mriFilterString),
497	orQuery);
498
499	collection.aggregate(Arrays.asList(
500	match(andQuery), //match(BasicDBObject.parse(matchQuery))
501	// match((List<DBObject>)JSON.parse(matchQuery)),
502	unwind("$geoLocationCountryCode"),
503	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
504	addToSet("domain", "$basicDomain"))),
505	sort(BasicDBObject.parse("{count : -1}"))
506	)).forEach((Block<Document>)doc -> writeDoc(doc, writer));
507
508	// casting to Block<Document> necessary because otherwise we see the error at
509	// https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
510
511	// Less efficient way is to keep all the results in memory and then
512	// write them out one at a time
513	/*
514	AggregateIterable<Document> output
515	= collection.aggregate(Arrays.asList(
516	match(andQuery), //match(BasicDBObject.parse(matchQuery))
517	// match((List<DBObject>)JSON.parse(matchQuery)),
518	unwind("$geoLocationCountryCode"),
519	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
520	sort(BasicDBObject.parse("{count : -1}"))
521	));
522
523
524	for (Document doc : output) {
525	//System.out.println(doc);
526	System.out.println(doc.toJson());
527
528	}
529	*/
530	return;
531	}
532
533	/** Do the aggregates for writing out tables.
534	Table1:
535
536	*/
537	public void writeTables(File outFolder) {
538	// In this function, we're always dealing with the Websites mongodb collection.
539	MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
540
541	String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"};
542	for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
543	File outFile = new File(outFolder, tableNames[tableNum] + ".json");
544	File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
545	try (
546	Writer writer = new BufferedWriter(new FileWriter(outFile));
547	CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT);
548	) {
549
550	// Write out the CSV column headings
551	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
552	csvWriter.printRecord("countryCode", "siteCount",
553	"numPagesInMRI count","numPagesContainingMRICount"/, "domain"/);
554
555	AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
556
557	int docNum = 0;
558	for (Document doc : output) {
559	//System.out.println(doc);
560	writeDocAsJsonRecord(++docNum, doc, writer);
561	writeDocAsCSVRecord(++docNum, doc, csvWriter);
562	}
563	logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
564	} catch(UncheckedIOException ioe) {
565	logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
566	}
567	catch(Exception e) {
568	logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
569	}
570	}
571	}
572
573	public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) {
574
575	AggregateIterable<Document> output = null;
576
577	switch(tableNum) {
578
579	case 1:
580	/* 1table_allCrawledSites -
581
582	db.Websites.aggregate([
583	{ $unwind: "$geoLocationCountryCode" },
584	{
585	$group: {
586	_id: "$geoLocationCountryCode",
587	count: { $sum: 1 },
588	//domain: { $addToSet: '$domain' },
589	numPagesInMRICount: { $sum: '$numPagesInMRI' },
590	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
591	}
592	},
593	{ $sort : { count : -1} }
594	]);
595	*/
596	output = collection.aggregate(Arrays.asList(
597	//match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
598	unwind("$geoLocationCountryCode"),
599	group("$geoLocationCountryCode", Arrays.asList(
600	sum("count", 1),
601	/addToSet("domain", "$domain"),/
602	sum("numPagesInMRICount", "$numPagesInMRI"),
603	sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
604	sort(BasicDBObject.parse("{count : -1}"))
605	));
606	break;
607
608	case 2:
609	/*
610	db.Websites.aggregate([
611	{ $match: { numPagesInMRI: {$gt: 0} } },
612	{ $unwind: "$geoLocationCountryCode" },
613	{
614	$group: {
615	_id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower
616	count: { $sum: 1 },
617	//domain: { $addToSet: '$domain' },
618	numPagesInMRICount: { $sum: '$numPagesInMRI' },
619	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
620	}
621	},
622	{ $sort : { count : -1} }
623	]);
624	*/
625	output = collection.aggregate(Arrays.asList(
626	match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
627	unwind("$geoLocationCountryCode"),
628	group("$geoLocationCountryCode", Arrays.asList(
629	sum("count", 1),
630	/addToSet("domain", "$domain"),/
631	sum("numPagesInMRICount", "$numPagesInMRI"),
632	sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
633	sort(BasicDBObject.parse("{count : -1}"))
634	));
635	break;
636
637	default: logger.error("Unknown table number: " + tableNum);
638
639	}
640
641	return output;
642
643	}
644
645
646
647	/**
648	* called by lambda forEach() call on Document objects to write them out to a file.
649	* Have to deal with unreported exceptions here that can't be dealt with when doing
650	* the actual forEach(). See
651	* https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
652	*/
653	public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
654
655	// If there's a domain field in the json Doc, sort this domain listing alphabetically
656	Object domainList = doc.remove("domain");
657	if(domainList != null) {
658	doc.put("domain", sortAlphabetically(domainList));
659	}
660
661	//OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
662	// Can't control json output to add newlines after each array element,
663	// no matter which JsonMode is used.
664
665	// https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
666	// Still can't control array element output,
667	// but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
668	//JsonWriterSettings writeSettings = new JsonWriterSettings();
669	//writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
670	//writer.write(doc.toJson(writeSettings) + NEWLINE);
671
672	// Not the JsonWriter of mongodb java driver:
673	// https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
674
675	// Have to use gson's pretty print to produce a json string that contains
676	// newlines after every array element in the json:
677
678	String jsonStr = prettyPrintJson(doc.toJson());
679	//System.err.println(jsonStr);
680	try {
681	writer.write(jsonStr + NEWLINE);
682	} catch (IOException ex) {
683	//throw ex;
684	throw new UncheckedIOException(ex);
685	}
686	}
687
688	private List sortAlphabetically(Object list) {
689	BsonArray domainList = (BsonArray)list;
690	//for(String domain : domainList) {
691	for(int i = domainList.size() - 1; i >= 0; i--) {
692	BsonString domain = domainList.get(i).asString();
693	String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
694	domainList.set(i, new BsonString(domainStr));
695	}
696
697	return domainList;
698	}
699
700	public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
701	String jsonStr = prettyPrintJson(doc.toJson());
702	//System.err.println(jsonStr);
703	try {
704	writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
705	} catch (IOException ex) {
706	//throw ex;
707	throw new UncheckedIOException(ex);
708	}
709	}
710
711	// TODO
712	//public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
713	public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
714	String jsonStr = doc.toJson();
715	JsonParser parser = new JsonParser();
716	JsonElement json = parser.parse(jsonStr);
717
718	JsonObject jsonObj = (JsonObject)json;
719
720	String countryCode = jsonObj.get("_id").getAsString();
721	int siteCount = jsonObj.get("count").getAsInt();
722	int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
723	int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
724
725	//System.err.println(jsonStr);
726	try {
727	//writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
728	csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount);
729	} catch (IOException ex) {
730	//throw ex;
731	throw new UncheckedIOException(ex);
732	}
733	}
734
735	public String prettyPrintJson(String jsonStr) {
736	Gson gson = new GsonBuilder().setPrettyPrinting().create();
737	JsonParser jp = new JsonParser();
738	JsonElement je = jp.parse(jsonStr);
739	String prettyJsonString = gson.toJson(je);
740	return prettyJsonString;
741	}
742
743
744	/** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
745	public void close() {}
746
747
748	// TODO:
749	// In the database, need to ensure we have else
750	// create collection (table in RDBMS) websites, create collection webpages.
751	// The webpages collection will have sentences embedded based on my decisions from
752	// reading the series
753	// https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
754	// Then need functions:
755	// insertWebsiteDocument()
756	// insertWebpageDocument()
757
758	public static void main(String args[]) {
759	try {
760	MongoDBAccess mongodbCon = new MongoDBAccess();
761	mongodbCon.connectToDB();
762	mongodbCon.showCollections();
763
764	} catch(Exception e) {
765	e.printStackTrace();
766	}
767	}
768	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33906

Download in other formats: