Context Navigation

MongoDBQueryer.java@ 33919

Last change on this file since 33919 was 33919, checked in by ak19, 4 years ago

SummaryTool now uses the CountryCodeCountsMapData.java class to generate the geojson-features files from the tables it already created using MongoDB query results. Switched over from geojson.tools to geojson.io since the latter allows passing geojson mapdata in the URL. The firefox screenshotting is still not working. But I can't even get complex geojson features to work from the commandline yet, so then there's another possible layer of complexity when running firefox as a Java process. Added jna jar files used by Greenstone's SafeProcess for launching Firefox as a Java process.

File size: 31.9 KB

Line
1	package org.greenstone.atea;
2
3	//import org.bson.BSONObject;
4
5	import com.mongodb.client.AggregateIterable;
6	import com.mongodb.client.MongoCollection;
7
8	// to use collection.find() filters like eq(), regex() etc
9	import static com.mongodb.client.model.Filters.*;
10	// to use collection.find().projection() filters like include() etc
11	import static com.mongodb.client.model.Projections.*;
12	// to use aggregation functions like unwind(), match(), sort() etc
13	import static com.mongodb.client.model.Aggregates.*;
14	// to use functions like sum() and addToSet() within aggregation functions
15	import static com.mongodb.client.model.Accumulators.*;
16
17	//import org.bson.conversions.Bson;
18	import com.mongodb.BasicDBObject;
19
20
21	import com.mongodb.Block;
22
23	import org.bson.BsonArray;
24	import org.bson.BsonString;
25	import org.bson.BsonValue;
26	import org.bson.Document;
27	import org.bson.conversions.Bson;
28	import org.bson.json.JsonMode;
29	import org.bson.json.JsonWriterSettings;
30
31	import com.mongodb.util.JSON;
32	//import com.mongodb.DBObject;
33
34
35	import com.google.gson.*; // for pretty printing
36
37	import java.io.BufferedReader;
38	import java.io.BufferedWriter;
39	import java.io.File;
40	import java.io.FileReader;
41	import java.io.FileWriter;
42	import java.io.IOException;
43	import java.io.UncheckedIOException;
44	import java.io.Writer;
45	import javax.xml.ws.Holder;
46
47
48	import java.util.Arrays;
49	import java.util.ArrayList;
50	import java.util.List;
51	import java.util.TreeSet;
52
53
54	import org.apache.log4j.Logger;
55	import org.apache.commons.csv.*;
56
57	/**
58	* https://www.tutorialspoint.com/mongodb/mongodb_java.htm
59	*
60	* TO COMPILE:
61	* maori-lang-detection/src$
62	* javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBQueryer.java
63	*
64	* TO RUN:
65	* java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBQueryer
66	*
67	* Manually connecting to mongodb from client:
68	* mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
69	* Then after connecting with pwd, type:
70	* use DBNAME
71	*
72	* Or connect to mongodb and specify db in one statement:
73	* mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
74	*
75	* Some links:
76	* - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
77	* - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
78	* - https://tecadmin.net/tutorial/mongodb/drop-collection/
79	* IMPORTANT LINK:
80	* - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
81	*
82	* API:
83	* - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
84	* - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
85	*/
86	public class MongoDBQueryer extends MongoDBAccess {
87
88	private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBQueryer.class.getName());
89
90	public static final String NEWLINE = System.getProperty("line.separator");
91
92	/** mongodb filter types to execute */
93	public static final int IS_MRI = 0;
94	public static final int CONTAINS_MRI = 1;
95
96	/** Some reused fieldnames in the Websites collection */
97	private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
98	private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
99
100
101
/**
 * Creates a queryer. All set-up is delegated to the MongoDBAccess superclass constructor.
 *
 * @throws Exception propagated from the superclass constructor
 */
public MongoDBQueryer() throws Exception {
    super();
}
105
106
107	public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
108	return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
109	}
110	public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
111	return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
112	}
113
114	/**
115	* Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
116	* Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
117	* Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
118	* mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
119	*
120	* Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
121	* Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
122	* https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
123	* http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
124	*/
125	public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
126
127	final ArrayList<String> urlsList = new ArrayList<String>();
128
129	// remove any http(s)://(www.) from the start of URL first
130	// since it goes into a regex
131	domain = Utility.stripProtocolAndWWWFromURL(domain);
132
133	// load the "webpages" db table
134	// in mongodb, the equivalent of db tables are called 'collections'
135	MongoCollection<Document> collection = getWebpagesCollection();
136
137	// code we'll execute in Iterable.forEach() below
138	// see also https://www.baeldung.com/foreach-java
139	Block<Document> storeURL = new Block<Document>() {
140	@Override
141	public void apply(final Document document) {
142	//System.out.println(document.toJson());
143	String url = document.getString("URL");
144	// add to our urlsList
145	//System.out.println(url);
146	urlsList.add(url);
147	}
148	};
149
150
151	// Run the following mongodb query:
152	// db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
153
154	// 1. One way that works:
155	//collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
156
157	// 2. Another way:
158	//String query = "{URL: /DOMAIN/, isMRI: true}";
159	String query = "{URL: /DOMAIN/, ";
160	if(filterType == IS_MRI) {
161	query += "isMRI: true}";
162	} else if(filterType == CONTAINS_MRI) {
163	query += "containsMRI: true}";
164	}
165
166	domain = domain.replace(".", "\\."); // escape dots in domain for regex
167	query = query.replace("DOMAIN", domain);
168
169	//System.err.println("Executing find query: " + query);
170
171	BasicDBObject findObj = BasicDBObject.parse(query);
172	BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
173
174
175	collection.find(findObj).projection(projectionObj).forEach(storeURL);
176
177	return urlsList;
178	}
179
180	/**
181	* Does a mongoDB query like the following, depending on filter type:
182	* db.getCollection('Webpages').find({isMRI: true}).count()
183	* @param filterType can be either IS_MRI or CONTAINS_MRI.
184	* @return the number of webpages that matched the filterType setting.
185	*/
186	public long countOfWebpagesMatching(int filterType) {
187	String query = (filterType == IS_MRI) ? "{isMRI: true}" : "{containsMRI: true}";
188	long result = -1;
189	MongoCollection<Document> collection = getWebpagesCollection();
190
191
192	try {
193	BasicDBObject queryObj = BasicDBObject.parse(query);
194	//result = collection.find(queryObj).count();
195	// https://stackoverflow.com/questions/32683458/how-to-call-count-operation-after-find-with-mongodb-java-driver
196	result = collection.countDocuments(queryObj);
197
198	} catch(Exception e) {
199	logger.error("MongoDB couldn't parse provided query " + query);
200	}
201
202	return result;
203	}
204
205	/**
206	* RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
207	*
208	* https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
209	* https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
210	* Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
211	*
212	* (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
213	* https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
214	* On using group(TExpression) inside collection.aggregate().
215	*
216	* For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
217	* and https://www.javatpoint.com/java-8-foreach
218	* and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
219	*
220	* Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,
221	* listing all the base domain strings (unsorted with protocol and any www)
222	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
223	* matching sites.
224	*
225	* The mongodb aggregate() we want to run this time:
226	*
227	db.Websites.aggregate([
228	{
229	$match: {
230	$and: [
231	{numPagesContainingMRI: {$gt: 0}},
232	{$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
233	]
234	}
235	},
236	{ $unwind: "$geoLocationCountryCode" },
237	{
238	$group: {
239	_id: "nz",
240	count: { $sum: 1 },
241	domain: { $addToSet: '$domain' }
242	}
243	},
244	{ $sort : { count : -1} }
245	]);
246	*/
247	public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
248	// working with the WebSites collection, not WebPages collection!
249	MongoCollection<Document> collection = getWebsitesCollection();
250
251	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
252
253	// Want a counter,
254	// but lambda expressions can only take final variables and those can't be incremented
255	// But can use an array to store incrementable counter or Holder type
256	// https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
257	Holder<Integer> docNum = new Holder<>(0);
258
259	Bson orQuery = or(
260	BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
261	BasicDBObject.parse("{domain: /\\.nz$/}")
262	);
263	Bson andQuery = and(
264	BasicDBObject.parse(mriFilterString),
265	orQuery);
266
267	// Hopefully the lambda expression (forEach()) at end means
268	// we write out each result Document as we get it
269	collection.aggregate(Arrays.asList(
270	match(andQuery),
271	unwind("$geoLocationCountryCode"),
272	group("NZ", Arrays.asList(sum("count", 1),
273	addToSet("domain", "$domain"))),
274	sort(BasicDBObject.parse("{count : -1}"))
275	)).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
276
277	// should only have one doc for NZ since it's a count by geolocation.
278
279	return;
280	}
281
282
283	/**
284	* Count of overseas (non-NZ and non-.nz TLD) websites
285	* containing a positive number of sentences in MRI,
286	* listing all the base domain strings (unsorted with protocol and any www)
287	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
288	* matching sites. Regardless of whether there's an mi in the URL path of any or not.
289	*
290	* The aggregate() we want to run this time:
291	*
292	db.Websites.aggregate([
293	{
294	$match: {
295	$and: [
296	{geoLocationCountryCode: {$ne: "NZ"}},
297	{domain: {$not: /\.nz/}},
298	{numPagesContainingMRI: {$gt: 0}}
299	]
300	}
301	},
302	{ $unwind: "$geoLocationCountryCode" },
303	{
304	$group: {
305	_id: {$toLower: '$geoLocationCountryCode'},
306	count: { $sum: 1 },
307	domain: { $addToSet: '$domain' }
308	}
309	},
310	{ $sort : { count : -1} }
311	]);
312	*/
313	public void aggregateContainsMRIForOverseas(Writer writer, int filterType)
314	throws UncheckedIOException {
315	// working with the WebSites collection, not WebPages collection!
316	MongoCollection<Document> collection = getWebsitesCollection();
317
318	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
319
320	// Want a counter,
321	// but lambda expressions can only take final variables and those can't be incremented
322	// But can use an array to store incrementable counter or Holder
323	// https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
324	Holder<Integer> docNum = new Holder<>(1);
325
326	Bson andQuery = and(
327	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
328	BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
329	BasicDBObject.parse(mriFilterString));
330
331	collection.aggregate(Arrays.asList(
332	match(andQuery),
333	unwind("$geoLocationCountryCode"),
334	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
335	addToSet("domain", "$domain"))),
336	sort(BasicDBObject.parse("{count : -1}"))
337	)).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
338
339	return;
340	}
341
342
343	/** Count by country code of overseas (non-NZ and non-nz TLD) websites
344	* containing a positive number of sentences in MRI,
345	* listing all the base domain strings (unordered and with protocol and any www)
346	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
347	* matching sites.
348	*
349	* The aggregate() we want to run this time:
350	*
351	db.Websites.aggregate([
352	{
353	$match: {
354	$and: [
355	{geoLocationCountryCode: {$ne: "NZ"}},
356	{domain: {$not: /\.nz/}},
357	{numPagesContainingMRI: {$gt: 0}},
358	{$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
359	]
360	}
361	},
362	{ $unwind: "$geoLocationCountryCode" },
363	{
364	$group: {
365	_id: {$toLower: '$geoLocationCountryCode'},
366	count: { $sum: 1 },
367	domain: { $addToSet: '$domain' }
368	}
369	},
370	{ $sort : { count : -1} }
371	]);
372	*/
373	public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
374	boolean isMiInURLPath) throws UncheckedIOException
375	{
376	// working with the WebSites collection, not WebPages collection!
377	MongoCollection<Document> collection = getWebsitesCollection();
378
379	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
380
381	// Want a counter,
382	// but lambda expressions can only take final variables and those can't be incremented
383	// But can use an array to store incrementable counter or Holder
384	// https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
385	Holder<Integer> docNum = new Holder<>(1);
386
387	/*
388	Bson orQuery = or(
389	BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
390	BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
391	// e.g. "{urlContainsLangCodeInPath: false}"
392	);
393	*/
394	Bson andQuery = and(
395	BasicDBObject.parse(mriFilterString),
396	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
397	BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
398	BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery);
399
400	collection.aggregate(Arrays.asList(
401	match(andQuery), //match(BasicDBObject.parse(matchQuery))
402	// match((List<DBObject>)JSON.parse(matchQuery)),
403	unwind("$geoLocationCountryCode"),
404	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
405	addToSet("domain", "$domain"))),
406	sort(BasicDBObject.parse("{count : -1}"))
407	)).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
408
409	// casting to Block<Document> necessary because otherwise we see the error at
410	// https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
411
412	// Less efficient way is to keep all the results in memory and then
413	// write them out one at a time
414	/*
415	AggregateIterable<Document> output
416	= collection.aggregate(Arrays.asList(
417	match(andQuery), //match(BasicDBObject.parse(matchQuery))
418	// match((List<DBObject>)JSON.parse(matchQuery)),
419	unwind("$geoLocationCountryCode"),
420	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
421	sort(BasicDBObject.parse("{count : -1}"))
422	));
423
424
425	for (Document doc : output) {
426	//System.out.println(doc);
427	System.out.println(doc.toJson());
428
429	}
430	*/
431	return;
432	}
433
434	/** Perform the aggregates for writing out the summary tables. */
435	public String[] writeTables(File outFolder) {
436	// In this function, we're always dealing with the Websites mongodb collection.
437	MongoCollection<Document> collection = getWebsitesCollection();
438
439	String[] tableNames = {
440	"",
441	"1table_allCrawledSites",
442	"2table_sitesWithPagesInMRI",
443	"3table_sitesWithPagesContainingMRI",
444	"4table_containsMRI_exclTentativeProductSites",
445	"5table_sitesWithPagesContainingMRI_allNZGrouped",
446	"5table_sitesWithPagesInMRI_allNZGrouped"
447	};
448	for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
449	File outFile = new File(outFolder, tableNames[tableNum] + ".json");
450	File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
451	try (
452	Writer writer = new BufferedWriter(new FileWriter(outFile));
453	CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); // quote ALL vs MINIMAL vs NON_NUMERIC fields
454	) {
455
456	// Write out the CSV column headings
457	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
458	csvWriter.printRecord("countryCode", "siteCount",
459	"numPagesInMRI count","numPagesContainingMRICount",
460	"totalPagesAcrossMatchingSites"/, "domain"/);
461
462	AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
463
464
465	int docNum = 0;
466	// get any NZ specific row's data, if it exists for this table
467	// and add that as the docNum="0th" doc
468	Document nzDoc = getNZTableRowData(collection, tableNum);
469	if(nzDoc != null) {
470	writeDocAsJsonRecord(docNum, nzDoc, writer);
471	writeDocAsCSVRecord(docNum, nzDoc, csvWriter);
472	}
473	// all other table row data start at 1 for docNum
474	for (Document doc : output) {
475	//System.out.println(doc);
476	writeDocAsJsonRecord(++docNum, doc, writer);
477	writeDocAsCSVRecord(++docNum, doc, csvWriter);
478	}
479
480	logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
481	} catch(UncheckedIOException ioe) {
482	logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
483	}
484	catch(Exception e) {
485	logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
486	}
487	}
488
489	return tableNames;
490	}
491
492	public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {
493
494	Document nzRowData = null;
495	switch(tableNum) {
496	case 1: case 2: case 3: case 4:
497
498	break;
499
500	//case 5:
501	//filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}";
502	//case 6:
503	//filterQueryStr = "{numPagesInMRI: {$gt: 0}}";
504	case 5: case 6:
505	String filterQueryStr = (tableNum == 5) ?
506	"{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
507
508	/* Get NZ only table data.
509	Can be numPagesContainingMRI or numPagesInMRI > 0 depending on filterQueryStr.
510
511	db.Websites.aggregate([
512	{
513	$match: {
514	$and: [
515	{numPagesContainingMRI: {$gt: 0}},
516	{$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
517	]
518	}
519	},
520	{ $unwind: "$geoLocationCountryCode" },
521	{
522	$group: {
523	_id: "NZ",
524	count: { $sum: 1 },
525	//domain: { $addToSet: '$domain' },
526	numPagesInMRICount: { $sum: '$numPagesInMRI' },
527	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
528	totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
529	}
530	},
531	{ $sort : { count : -1} }
532	]);
533
534	*/
535
536	Bson orQuery = or(
537	BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
538	BasicDBObject.parse("{domain: /\\.nz$/}")
539	);
540	Bson andQuery = and(
541	BasicDBObject.parse(filterQueryStr), // e.g."{numPagesContainingMRI: {$gt: 0}}"
542	orQuery
543	);
544	AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
545	match(andQuery),
546	unwind("$geoLocationCountryCode"),
547	group("NZ", Arrays.asList(
548	sum("count", 1),
549	/addToSet("domain", "$domain"),/
550	sum("numPagesInMRICount", "$numPagesInMRI"),
551	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
552	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
553	sort(BasicDBObject.parse("{count : -1}"))
554	));
555
556	nzRowData = output.first(); // first and only document in result
557
558	break;
559
560
561	default: logger.error("Unknown table number: " + tableNum);
562	}
563
564	return nzRowData;
565	}
566
567	public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum)
568	{
569	//String filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; // only used if tableNum = 5\|6
570
571	AggregateIterable<Document> output = null;
572	Bson orQuery = null;
573	Bson andQuery = null;
574
575	switch(tableNum) {
576
577	case 1:
578	/* 1table_allCrawledSites -
579
580	db.Websites.aggregate([
581	{ $unwind: "$geoLocationCountryCode" },
582	{
583	$group: {
584	_id: "$geoLocationCountryCode",
585	count: { $sum: 1 },
586	//domain: { $addToSet: '$domain' },
587	numPagesInMRICount: { $sum: '$numPagesInMRI' },
588	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
589	totalPagesAcrossSites: { $sum: '$totalPages'}
590	}
591	},
592	{ $sort : { count : -1} }
593	]);
594	*/
595	output = collection.aggregate(Arrays.asList(
596	//match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
597	unwind("$geoLocationCountryCode"),
598	group("$geoLocationCountryCode", Arrays.asList(
599	sum("count", 1),
600	/addToSet("domain", "$domain"),/
601	sum("numPagesInMRICount", "$numPagesInMRI"),
602	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
603	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
604	sort(BasicDBObject.parse("{count : -1}"))
605	));
606	break;
607
608	case 2:
609	/*
610	db.Websites.aggregate([
611	{ $match: { numPagesInMRI: {$gt: 0} } },
612	{ $unwind: "$geoLocationCountryCode" },
613	{
614	$group: {
615	_id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower: _id:'$geoLocationCountryCode'
616	count: { $sum: 1 },
617	//domain: { $addToSet: '$domain' },
618	numPagesInMRICount: { $sum: '$numPagesInMRI' },
619	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
620	totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'}
621	}
622	},
623	{ $sort : { count : -1} }
624	]);
625	*/
626	output = collection.aggregate(Arrays.asList(
627	match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
628	unwind("$geoLocationCountryCode"),
629	group("$geoLocationCountryCode", Arrays.asList(
630	sum("count", 1),
631	/addToSet("domain", "$domain"),/
632	sum("numPagesInMRICount", "$numPagesInMRI"),
633	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
634	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
635	sort(BasicDBObject.parse("{count : -1}"))
636	));
637	break;
638
639	case 3:
640	/*
641	db.Websites.aggregate([
642	{
643	$match: { numPagesContainingMRI: {$gt: 0} }
644	},
645	{ $unwind: "$geoLocationCountryCode" },
646	{
647	$group: {
648	_id: '$geoLocationCountryCode',
649	count: { $sum: 1 },
650	//domain: { $addToSet: '$domain' },
651	numPagesInMRICount: { $sum: '$numPagesInMRI' },
652	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
653	totalPagesAcrossSitesWithPosContainsMRI: { $sum: '$totalPages'}
654	}
655	},
656	{ $sort : { count : -1} }
657	]);
658	*/
659	output = collection.aggregate(Arrays.asList(
660	match(BasicDBObject.parse("{ numPagesContainingMRI: {$gt: 0} }")),
661	unwind("$geoLocationCountryCode"),
662	group("$geoLocationCountryCode", Arrays.asList(
663	sum("count", 1),
664	/addToSet("domain", "$domain"),/
665	sum("numPagesInMRICount", "$numPagesInMRI"),
666	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
667	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
668	sort(BasicDBObject.parse("{count : -1}"))
669	));
670	break;
671
672	case 4:
673	/*
674	db.Websites.aggregate([
675	{
676	$match: {
677	$and: [
678	{numPagesContainingMRI: {$gt: 0}},
679	{$or: [{geoLocationCountryCode: /(NZ\|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
680	]
681	}
682	},
683	{ $unwind: "$geoLocationCountryCode" },
684	{
685	$group: {
686	_id: {$toLower: '$geoLocationCountryCode'},
687	count: { $sum: 1 },
688	//domain: { $addToSet: '$domain' },
689	numPagesInMRICount: { $sum: '$numPagesInMRI' },
690	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
691	totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
692	}
693	},
694	{ $sort : { count : -1} }
695	]);
696	*/
697
698	orQuery = or(
699	BasicDBObject.parse("{geoLocationCountryCode: /(NZ\|AU)/}"),
700	//BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
701	BasicDBObject.parse("{domain: /\\.nz$/}"),
702	BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
703	);
704	andQuery = and(
705	BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
706	orQuery);
707	output = collection.aggregate(Arrays.asList(
708	match(andQuery),
709	unwind("$geoLocationCountryCode"),
710	group("$geoLocationCountryCode", Arrays.asList(
711	sum("count", 1),
712	/addToSet("domain", "$domain"),/
713	sum("numPagesInMRICount", "$numPagesInMRI"),
714	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
715	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
716	sort(BasicDBObject.parse("{count : -1}"))
717	));
718	break;
719	//case 5:
720	//filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}";
721	//case 6:
722	//filterQueryStr = "{numPagesInMRI: {$gt: 0}}";
723	case 5: case 6:
724	String filterQueryStr = (tableNum == 5) ?
725	"{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
726	/*
727	Table of count by countryCode of sites with numPagesContainingMRI > 0
728	(or numPagesInMRI > 0).
729	Just do OVERSEAS here, NZ handled separately
730
731	db.Websites.aggregate([
732	{
733	$match: {
734	$and: [
735	{geoLocationCountryCode: {$ne: "NZ"}},
736	{domain: {$not: /\.nz$/}},
737	{numPagesContainingMRI: {$gt: 0}}
738	]
739	}
740	},
741	{ $unwind: "$geoLocationCountryCode" },
742	{
743	$group: {
744	_id: '$geoLocationCountryCode',
745	count: { $sum: 1 },
746	//domain: { $addToSet: '$domain' },
747	numPagesInMRICount: { $sum: '$numPagesInMRI' },
748	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
749	totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
750	}
751	},
752	{ $sort : { count : -1} }
753	]);
754	*/
755
756	andQuery = and(
757	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
758	BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
759	BasicDBObject.parse(filterQueryStr) // e.g. "{numPagesContainingMRI: {$gt: 0}}"
760	);
761	output = collection.aggregate(Arrays.asList(
762	match(andQuery),
763	unwind("$geoLocationCountryCode"),
764	group("$geoLocationCountryCode", Arrays.asList(
765	sum("count", 1),
766	/addToSet("domain", "$domain"),/
767	sum("numPagesInMRICount", "$numPagesInMRI"),
768	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
769	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
770	sort(BasicDBObject.parse("{count : -1}"))
771	));
772
773	break;
774	default: logger.error("Unknown table number: " + tableNum);
775	}
776
777	return output;
778
779	}
780
781
782
783	/**
784	* called by lambda forEach() call on Document objects to write them out to a file.
785	* Have to deal with unreported exceptions here that can't be dealt with when doing
786	* the actual forEach(). See
787	* https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
788	*/
789	public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException {
790
791	// If there's a domain field in the json Doc, sort this domain listing alphabetically
792	Object domainList = doc.remove("domain");
793	///logger.info("CLASS: " + domainList.getClass());
794	if(domainList != null) {
795	List sortedList = sortAlphabetically(domainList);
796	doc.put("uniqueCount", sortedList.size());
797	doc.put("domain", sortedList);
798	}
799
800	//OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
801	// Can't control json output to add newlines after each array element,
802	// no matter which JsonMode is used.
803
804	// https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
805	// Still can't control array element output,
806	// but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
807	//JsonWriterSettings writeSettings = new JsonWriterSettings();
808	//writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
809	//writer.write(doc.toJson(writeSettings) + NEWLINE);
810
811	// Not the JsonWriter of mongodb java driver:
812	// https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
813
814	// Have to use gson's pretty print to produce a json string that contains
815	// newlines after every array element in the json:
816
817	String jsonStr = prettyPrintJson(doc.toJson());
818	//System.err.println(jsonStr);
819	try {
820	writer.write("/* " + docNum + " */" + NEWLINE);
821	writer.write(jsonStr + NEWLINE + NEWLINE);
822	} catch (IOException ex) {
823	//throw ex;
824	throw new UncheckedIOException(ex);
825	}
826	}
827
828	public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
829	String jsonStr = prettyPrintJson(doc.toJson());
830	//System.err.println(jsonStr);
831	try {
832	writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
833	} catch (IOException ex) {
834	//throw ex;
835	throw new UncheckedIOException(ex);
836	}
837	}
838
839	// TODO
840	//public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
841	public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
842	String jsonStr = doc.toJson();
843	JsonParser parser = new JsonParser();
844	JsonElement json = parser.parse(jsonStr);
845
846	JsonObject jsonObj = (JsonObject)json;
847
848	String countryCode = jsonObj.get("_id").getAsString();
849	int siteCount = jsonObj.get("count").getAsInt();
850	int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
851	int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
852	int totalPagesAcrossMatchingSites = jsonObj.get("totalPagesAcrossMatchingSites").getAsInt();
853
854	//System.err.println(jsonStr);
855	try {
856	//writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
857	csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount, totalPagesAcrossMatchingSites);
858	} catch (IOException ex) {
859	//throw ex;
860	throw new UncheckedIOException(ex);
861	}
862	}
863
864	public String prettyPrintJson(String jsonStr) {
865	Gson gson = new GsonBuilder().setPrettyPrinting().create();
866	JsonParser jp = new JsonParser();
867	JsonElement je = jp.parse(jsonStr);
868	String prettyJsonString = gson.toJson(je);
869	return prettyJsonString;
870	}
871
872	private List sortAlphabetically(Object list) {
873	//BsonArray domainList = (BsonArray)list;
874	ArrayList<BsonValue> domainList = (ArrayList<BsonValue>)list;
875	/*
876	// for(String domain : domainList) {
877	for(int i = domainList.size() - 1; i >= 0; i--) {
878	BsonString domain = domainList.get(i).asString();
879	String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
880	domainList.set(i, new BsonString(domainStr));
881	}
882	Collections.sort(domainList);
883	// still need to get rid of non-unique values...
884	*/
885
886	TreeSet<String> set = new TreeSet<String>();
887	for(int i = domainList.size() - 1; i >= 0; i--) {
888	///BsonValue val = domainList.get(i);
889	///BsonString domain = val.asString();
890	//BsonString domain = domainList.get(i).asString();
891	//String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
892	Object domain = domainList.get(i);
893	String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
894	set.add(domainStr);
895	//domainList.set(i, new BsonString(domainStr));
896	}
897
898	domainList = new ArrayList<BsonValue>(); //new BsonArray();
899	for(String s : set) {
900	domainList.add(new BsonString(s));
901	}
902	return domainList;
903	}
904
905	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java@ 33919

Download in other formats: