Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java@ 33913

Last change on this file since 33913 was 33913, checked in by ak19, 4 years ago
Adjusted table mongodb query statements to be more exact, but same results. 2. Adjusted code to not treat Australia specially, as the AU site with mi in URL path has now shifted to US. 3. Differences in geoLocation results from previous mongoDB ingest to present one documented for cases not dealing with mi in URL path of overseas domains. 4.
File size: 30.1 KB

Line
1	package org.greenstone.atea;
2
3	//import org.bson.BSONObject;
4
5	import com.mongodb.client.AggregateIterable;
6	import com.mongodb.client.MongoCollection;
7
8	// to use collection.find() filters like eq(), regex() etc
9	import static com.mongodb.client.model.Filters.*;
10	// to use collection.find().projection() filters like include() etc
11	import static com.mongodb.client.model.Projections.*;
12	// to use aggregation functions like unwind(), match(), sort() etc
13	import static com.mongodb.client.model.Aggregates.*;
14	// to use functions like sum() and addToSet() within aggregation functions
15	import static com.mongodb.client.model.Accumulators.*;
16
17	//import org.bson.conversions.Bson;
18	import com.mongodb.BasicDBObject;
19
20
21	import com.mongodb.Block;
22
23	import org.bson.BsonArray;
24	import org.bson.BsonString;
25	import org.bson.BsonValue;
26	import org.bson.Document;
27	import org.bson.conversions.Bson;
28	import org.bson.json.JsonMode;
29	import org.bson.json.JsonWriterSettings;
30
31	import com.mongodb.util.JSON;
32	//import com.mongodb.DBObject;
33
34
35	import com.google.gson.*; // for pretty printing
36
37	import java.io.BufferedReader;
38	import java.io.BufferedWriter;
39	import java.io.File;
40	import java.io.FileReader;
41	import java.io.FileWriter;
42	import java.io.IOException;
43	import java.io.UncheckedIOException;
44	import java.io.Writer;
45	import javax.xml.ws.Holder;
46
47
48	import java.util.Arrays;
49	import java.util.ArrayList;
50	import java.util.List;
51	import java.util.TreeSet;
52
53
54	import org.apache.log4j.Logger;
55	import org.apache.commons.csv.*;
56
57	/**
58	* https://www.tutorialspoint.com/mongodb/mongodb_java.htm
59	*
60	* TO COMPILE:
61	* maori-lang-detection/src$
62	* javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBQueryer.java
63	*
64	* TO RUN:
65	* java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBQueryer
66	*
67	* Manually connecting to mongodb from client:
68	* mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
69	* Then after connecting with pwd, type:
70	* use DBNAME
71	*
72	* Or connect to mongodb and specify db in one statement:
73	* mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
74	*
75	* Some links:
76	* - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
77	* - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
78	* - https://tecadmin.net/tutorial/mongodb/drop-collection/
79	* IMPORTANT LINK:
80	* - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
81	*
82	* API:
83	* - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
84	* - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
85	*/
86	public class MongoDBQueryer extends MongoDBAccess {
87
88	private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBQueryer.class.getName());
89
90	public static final String NEWLINE = System.getProperty("line.separator");
91
92	/** mongodb filter types to execute */
93	public static final int IS_MRI = 0;
94	public static final int CONTAINS_MRI = 1;
95
96	/** Some reused fieldnames in the Websites collection */
97	private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
98	private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
99
100
101
102	public MongoDBQueryer() throws Exception {
103	super();
104	}
105
106
107	public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
108	return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
109	}
110	public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
111	return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
112	}
113
114	/**
115	* Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
116	* Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
117	* Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
118	* mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
119	*
120	* Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
121	* Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
122	* https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
123	* http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
124	*/
125	public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
126
127	final ArrayList<String> urlsList = new ArrayList<String>();
128
129	// remove any http(s)://(www.) from the start of URL first
130	// since it goes into a regex
131	domain = Utility.stripProtocolAndWWWFromURL(domain);
132
133	// load the "webpages" db table
134	// in mongodb, the equivalent of db tables are called 'collections'
135	MongoCollection<Document> collection = getWebpagesCollection();
136
137	// code we'll execute in Iterable.forEach() below
138	// see also https://www.baeldung.com/foreach-java
139	Block<Document> storeURL = new Block<Document>() {
140	@Override
141	public void apply(final Document document) {
142	//System.out.println(document.toJson());
143	String url = document.getString("URL");
144	// add to our urlsList
145	//System.out.println(url);
146	urlsList.add(url);
147	}
148	};
149
150
151	// Run the following mongodb query:
152	// db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
153
154	// 1. One way that works:
155	//collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
156
157	// 2. Another way:
158	//String query = "{URL: /DOMAIN/, isMRI: true}";
159	String query = "{URL: /DOMAIN/, ";
160	if(filterType == IS_MRI) {
161	query += "isMRI: true}";
162	} else if(filterType == CONTAINS_MRI) {
163	query += "containsMRI: true}";
164	}
165
166	domain = domain.replace(".", "\\."); // escape dots in domain for regex
167	query = query.replace("DOMAIN", domain);
168
169	//System.err.println("Executing find query: " + query);
170
171	BasicDBObject findObj = BasicDBObject.parse(query);
172	BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
173
174
175	collection.find(findObj).projection(projectionObj).forEach(storeURL);
176
177	return urlsList;
178	}
179
180	/**
181	* RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
182	*
183	* https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
184	* https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
185	* Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
186	*
187	* (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
188	* https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
189	* On using group(TExpression) inside collection.aggregate().
190	*
191	* For forEach lambda expressions, see also https://www.baeldung.com/foreach-java
192	* and https://www.javatpoint.com/java-8-foreach
193	* and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
194	*
195	* Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,
196	* listing all the base domain strings (unsorted with protocol and any www)
197	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
198	* matching sites.
199	*
200	* The mongodb aggregate() we want to run this time:
201	*
202	db.Websites.aggregate([
203	{
204	$match: {
205	$and: [
206	{numPagesContainingMRI: {$gt: 0}},
207	{$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
208	]
209	}
210	},
211	{ $unwind: "$geoLocationCountryCode" },
212	{
213	$group: {
214	_id: "nz",
215	count: { $sum: 1 },
216	domain: { $addToSet: '$domain' }
217	}
218	},
219	{ $sort : { count : -1} }
220	]);
221	*/
222	public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
223	// working with the WebSites collection, not WebPages collection!
224	MongoCollection<Document> collection = getWebsitesCollection();
225
226	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
227
228	// Want a counter,
229	// but lambda expressions can only take final variables and those can't be incremented
230	// But can use an array to store incrementable counter or Holder type
231	// https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
232	Holder<Integer> docNum = new Holder<>(0);
233
234	Bson orQuery = or(
235	BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
236	BasicDBObject.parse("{domain: /\\.nz$/}")
237	);
238	Bson andQuery = and(
239	BasicDBObject.parse(mriFilterString),
240	orQuery);
241
242	// Hopefully the lambda expression (forEach()) at end means
243	// we write out each result Document as we get it
244	collection.aggregate(Arrays.asList(
245	match(andQuery),
246	unwind("$geoLocationCountryCode"),
247	group("NZ", Arrays.asList(sum("count", 1),
248	addToSet("domain", "$domain"))),
249	sort(BasicDBObject.parse("{count : -1}"))
250	)).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
251
252	// should only have one doc for NZ since it's a count by geolocation.
253
254	return;
255	}
256
257
258	/**
259	* Count of overseas (non-NZ and non-.nz TLD) websites
260	* containing a positive number of sentences in MRI,
261	* listing all the base domain strings (unsorted with protocol and any www)
262	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
263	* matching sites. Regardless of whether there's an mi in the URL path of any or not.
264	*
265	* The aggregate() we want to run this time:
266	*
267	db.Websites.aggregate([
268	{
269	$match: {
270	$and: [
271	{geoLocationCountryCode: {$ne: "NZ"}},
272	{domain: {$not: /\.nz$/}},
273	{numPagesContainingMRI: {$gt: 0}}
274	]
275	}
276	},
277	{ $unwind: "$geoLocationCountryCode" },
278	{
279	$group: {
280	_id: {$toLower: '$geoLocationCountryCode'},
281	count: { $sum: 1 },
282	domain: { $addToSet: '$domain' }
283	}
284	},
285	{ $sort : { count : -1} }
286	]);
287	*/
288	public void aggregateContainsMRIForOverseas(Writer writer, int filterType)
289	throws UncheckedIOException {
290	// working with the WebSites collection, not WebPages collection!
291	MongoCollection<Document> collection = getWebsitesCollection();
292
293	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
294
295	// Want a counter,
296	// but lambda expressions can only take final variables and those can't be incremented
297	// But can use an array to store incrementable counter or Holder
298	// https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
299	Holder<Integer> docNum = new Holder<>(1);
300
301	Bson andQuery = and(
302	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
303	BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
304	BasicDBObject.parse(mriFilterString));
305
306	collection.aggregate(Arrays.asList(
307	match(andQuery),
308	unwind("$geoLocationCountryCode"),
309	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
310	addToSet("domain", "$domain"))),
311	sort(BasicDBObject.parse("{count : -1}"))
312	)).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
313
314	return;
315	}
316
317
318	/** Count by country code of overseas (non-NZ and non-nz TLD) websites
319	* containing a positive number of sentences in MRI,
320	* listing all the base domain strings (unordered and with protocol and any www)
321	* and total counts of numPagesInMRI and numPagesContainingMRI across all these
322	* matching sites.
323	*
324	* The aggregate() we want to run this time:
325	*
326	db.Websites.aggregate([
327	{
328	$match: {
329	$and: [
330	{geoLocationCountryCode: {$ne: "NZ"}},
331	{domain: {$not: /\.nz$/}},
332	{numPagesContainingMRI: {$gt: 0}},
333	{urlContainsLangCodeInPath: false} // or true: the value of the isMiInURLPath argument. AU is no longer special-cased.
334	]
335	}
336	},
337	{ $unwind: "$geoLocationCountryCode" },
338	{
339	$group: {
340	_id: {$toLower: '$geoLocationCountryCode'},
341	count: { $sum: 1 },
342	domain: { $addToSet: '$domain' }
343	}
344	},
345	{ $sort : { count : -1} }
346	]);
347	*/
348	public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
349	boolean isMiInURLPath) throws UncheckedIOException
350	{
351	// working with the WebSites collection, not WebPages collection!
352	MongoCollection<Document> collection = getWebsitesCollection();
353
354	String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
355
356	// Want a counter,
357	// but lambda expressions can only take final variables and those can't be incremented
358	// But can use an array to store incrementable counter or Holder
359	// https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
360	Holder<Integer> docNum = new Holder<>(1);
361
362	/*
363	Bson orQuery = or(
364	BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
365	BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
366	// e.g. "{urlContainsLangCodeInPath: false}"
367	);
368	*/
369	Bson andQuery = and(
370	BasicDBObject.parse(mriFilterString),
371	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
372	BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
373	BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery);
374
375	collection.aggregate(Arrays.asList(
376	match(andQuery), //match(BasicDBObject.parse(matchQuery))
377	// match((List<DBObject>)JSON.parse(matchQuery)),
378	unwind("$geoLocationCountryCode"),
379	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
380	addToSet("domain", "$domain"))),
381	sort(BasicDBObject.parse("{count : -1}"))
382	)).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
383
384	// casting to Block<Document> necessary because otherwise we see the error at
385	// https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
386
387	// Less efficient way is to keep all the results in memory and then
388	// write them out one at a time
389	/*
390	AggregateIterable<Document> output
391	= collection.aggregate(Arrays.asList(
392	match(andQuery), //match(BasicDBObject.parse(matchQuery))
393	// match((List<DBObject>)JSON.parse(matchQuery)),
394	unwind("$geoLocationCountryCode"),
395	group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
396	sort(BasicDBObject.parse("{count : -1}"))
397	));
398
399
400	for (Document doc : output) {
401	//System.out.println(doc);
402	System.out.println(doc.toJson());
403
404	}
405	*/
406	return;
407	}
408
409	/** Perform the aggregates for writing out the summary tables. */
410	public void writeTables(File outFolder) {
411	// In this function, we're always dealing with the Websites mongodb collection.
412	MongoCollection<Document> collection = getWebsitesCollection();
413
414	String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI",
415	"3table_sitesWithPagesContainingMRI", "4table_containsMRI_exclTentativeProductSites",
416	"5table_sitesWithPagesContainingMRI_allNZGrouped"
417	};
418	for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
419	File outFile = new File(outFolder, tableNames[tableNum] + ".json");
420	File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
421	try (
422	Writer writer = new BufferedWriter(new FileWriter(outFile));
423	CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); // quote ALL vs MINIMAL vs NON_NUMERIC fields
424	) {
425
426	// Write out the CSV column headings
427	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
428	csvWriter.printRecord("countryCode", "siteCount",
429	"numPagesInMRI count","numPagesContainingMRICount",
430	"totalPagesAcrossMatchingSites"/, "domain"/);
431
432	AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
433
434
435	int docNum = 0;
436	// get any NZ specific row's data, if it exists for this table
437	// and add that as the docNum="0th" doc
438	Document nzDoc = getNZTableRowData(collection, tableNum);
439	if(nzDoc != null) {
440	writeDocAsJsonRecord(docNum, nzDoc, writer);
441	writeDocAsCSVRecord(docNum, nzDoc, csvWriter);
442	}
443	// all other table row data start at 1 for docNum
444	for (Document doc : output) {
445	//System.out.println(doc);
446	writeDocAsJsonRecord(++docNum, doc, writer);
447	writeDocAsCSVRecord(++docNum, doc, csvWriter);
448	}
449
450	logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
451	} catch(UncheckedIOException ioe) {
452	logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
453	}
454	catch(Exception e) {
455	logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
456	}
457	}
458	}
459
460	public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {
461
462	Document nzRowData = null;
463	switch(tableNum) {
464	case 1: case 2: case 3: case 4:
465
466	break;
467
468	case 5:
469	/* Get NZ only table data:
470	db.Websites.aggregate([
471	{
472	$match: {
473	$and: [
474	{numPagesContainingMRI: {$gt: 0}},
475	{$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
476	]
477	}
478	},
479	{ $unwind: "$geoLocationCountryCode" },
480	{
481	$group: {
482	_id: "NZ",
483	count: { $sum: 1 },
484	//domain: { $addToSet: '$domain' },
485	numPagesInMRICount: { $sum: '$numPagesInMRI' },
486	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
487	totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
488	}
489	},
490	{ $sort : { count : -1} }
491	]);
492
493	*/
494	Bson orQuery = or(
495	BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
496	BasicDBObject.parse("{domain: /\\.nz$/}")
497	);
498	Bson andQuery = and(
499	BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
500	orQuery
501	);
502	AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
503	match(andQuery),
504	unwind("$geoLocationCountryCode"),
505	group("NZ", Arrays.asList(
506	sum("count", 1),
507	/addToSet("domain", "$domain"),/
508	sum("numPagesInMRICount", "$numPagesInMRI"),
509	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
510	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
511	sort(BasicDBObject.parse("{count : -1}"))
512	));
513
514	nzRowData = output.first(); // first and only document in result
515
516	break;
517
518	default: logger.error("Unknown table number: " + tableNum);
519	}
520
521	return nzRowData;
522	}
523
524	public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum)
525	{
526
527	AggregateIterable<Document> output = null;
528	Bson orQuery = null;
529	Bson andQuery = null;
530
531	switch(tableNum) {
532
533	case 1:
534	/* 1table_allCrawledSites -
535
536	db.Websites.aggregate([
537	{ $unwind: "$geoLocationCountryCode" },
538	{
539	$group: {
540	_id: "$geoLocationCountryCode",
541	count: { $sum: 1 },
542	//domain: { $addToSet: '$domain' },
543	numPagesInMRICount: { $sum: '$numPagesInMRI' },
544	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
545	totalPagesAcrossSites: { $sum: '$totalPages'}
546	}
547	},
548	{ $sort : { count : -1} }
549	]);
550	*/
551	output = collection.aggregate(Arrays.asList(
552	//match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
553	unwind("$geoLocationCountryCode"),
554	group("$geoLocationCountryCode", Arrays.asList(
555	sum("count", 1),
556	/addToSet("domain", "$domain"),/
557	sum("numPagesInMRICount", "$numPagesInMRI"),
558	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
559	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
560	sort(BasicDBObject.parse("{count : -1}"))
561	));
562	break;
563
564	case 2:
565	/*
566	db.Websites.aggregate([
567	{ $match: { numPagesInMRI: {$gt: 0} } },
568	{ $unwind: "$geoLocationCountryCode" },
569	{
570	$group: {
571	_id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower: _id:'$geoLocationCountryCode'
572	count: { $sum: 1 },
573	//domain: { $addToSet: '$domain' },
574	numPagesInMRICount: { $sum: '$numPagesInMRI' },
575	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
576	totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'}
577	}
578	},
579	{ $sort : { count : -1} }
580	]);
581	*/
582	output = collection.aggregate(Arrays.asList(
583	match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
584	unwind("$geoLocationCountryCode"),
585	group("$geoLocationCountryCode", Arrays.asList(
586	sum("count", 1),
587	/addToSet("domain", "$domain"),/
588	sum("numPagesInMRICount", "$numPagesInMRI"),
589	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
590	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
591	sort(BasicDBObject.parse("{count : -1}"))
592	));
593	break;
594
595	case 3:
596	/*
597	db.Websites.aggregate([
598	{
599	$match: { numPagesContainingMRI: {$gt: 0} }
600	},
601	{ $unwind: "$geoLocationCountryCode" },
602	{
603	$group: {
604	_id: '$geoLocationCountryCode',
605	count: { $sum: 1 },
606	//domain: { $addToSet: '$domain' },
607	numPagesInMRICount: { $sum: '$numPagesInMRI' },
608	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
609	totalPagesAcrossSitesWithPosContainsMRI: { $sum: '$totalPages'}
610	}
611	},
612	{ $sort : { count : -1} }
613	]);
614	*/
615	output = collection.aggregate(Arrays.asList(
616	match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
617	unwind("$geoLocationCountryCode"),
618	group("$geoLocationCountryCode", Arrays.asList(
619	sum("count", 1),
620	/addToSet("domain", "$domain"),/
621	sum("numPagesInMRICount", "$numPagesInMRI"),
622	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
623	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
624	sort(BasicDBObject.parse("{count : -1}"))
625	));
626	break;
627
628	case 4:
629	/*
630	db.Websites.aggregate([
631	{
632	$match: {
633	$and: [
634	{numPagesContainingMRI: {$gt: 0}},
635	{$or: [{geoLocationCountryCode: /(NZ\|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
636	]
637	}
638	},
639	{ $unwind: "$geoLocationCountryCode" },
640	{
641	$group: {
642	_id: {$toLower: '$geoLocationCountryCode'},
643	count: { $sum: 1 },
644	//domain: { $addToSet: '$domain' },
645	numPagesInMRICount: { $sum: '$numPagesInMRI' },
646	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
647	totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
648	}
649	},
650	{ $sort : { count : -1} }
651	]);
652	*/
653
654	orQuery = or(
655	BasicDBObject.parse("{geoLocationCountryCode: /(NZ\|AU)/}"),
656	//BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
657	BasicDBObject.parse("{domain: /\\.nz$/}"),
658	BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
659	);
660	andQuery = and(
661	BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
662	orQuery);
663	output = collection.aggregate(Arrays.asList(
664	match(andQuery),
665	unwind("$geoLocationCountryCode"),
666	group("$geoLocationCountryCode", Arrays.asList(
667	sum("count", 1),
668	/addToSet("domain", "$domain"),/
669	sum("numPagesInMRICount", "$numPagesInMRI"),
670	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
671	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
672	sort(BasicDBObject.parse("{count : -1}"))
673	));
674	break;
675
676	case 5:
677	/*
678	Table of count by countryCode of sites with numPagesContainingMRI > 0
679	Just do OVERSEAS here, NZ handled separately
680
681	db.Websites.aggregate([
682	{
683	$match: {
684	$and: [
685	{geoLocationCountryCode: {$ne: "NZ"}},
686	{domain: {$not: /\.nz$/}},
687	{numPagesContainingMRI: {$gt: 0}}
688	]
689	}
690	},
691	{ $unwind: "$geoLocationCountryCode" },
692	{
693	$group: {
694	_id: '$geoLocationCountryCode',
695	count: { $sum: 1 },
696	//domain: { $addToSet: '$domain' },
697	numPagesInMRICount: { $sum: '$numPagesInMRI' },
698	numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
699	totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
700	}
701	},
702	{ $sort : { count : -1} }
703	]);
704	*/
705
706	andQuery = and(
707	BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
708	BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
709	BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}")
710	);
711	output = collection.aggregate(Arrays.asList(
712	match(andQuery),
713	unwind("$geoLocationCountryCode"),
714	group("$geoLocationCountryCode", Arrays.asList(
715	sum("count", 1),
716	/addToSet("domain", "$domain"),/
717	sum("numPagesInMRICount", "$numPagesInMRI"),
718	sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
719	sum("totalPagesAcrossMatchingSites", "$totalPages"))),
720	sort(BasicDBObject.parse("{count : -1}"))
721	));
722
723	break;
724	default: logger.error("Unknown table number: " + tableNum);
725	}
726
727	return output;
728
729	}
730
731
732
733	/**
734	* called by lambda forEach() call on Document objects to write them out to a file.
735	* Have to deal with unreported exceptions here that can't be dealt with when doing
736	* the actual forEach(). See
737	* https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
738	*/
739	public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException {
740
741	// If there's a domain field in the json Doc, sort this domain listing alphabetically
742	Object domainList = doc.remove("domain");
743	///logger.info("CLASS: " + domainList.getClass());
744	if(domainList != null) {
745	List sortedList = sortAlphabetically(domainList);
746	doc.put("uniqueCount", sortedList.size());
747	doc.put("domain", sortedList);
748	}
749
750	//OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
751	// Can't control json output to add newlines after each array element,
752	// no matter which JsonMode is used.
753
754	// https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
755	// Still can't control array element output,
756	// but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
757	//JsonWriterSettings writeSettings = new JsonWriterSettings();
758	//writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
759	//writer.write(doc.toJson(writeSettings) + NEWLINE);
760
761	// Not the JsonWriter of mongodb java driver:
762	// https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
763
764	// Have to use gson's pretty print to produce a json string that contains
765	// newlines after every array element in the json:
766
767	String jsonStr = prettyPrintJson(doc.toJson());
768	//System.err.println(jsonStr);
769	try {
770	writer.write("/* " + docNum + " */" + NEWLINE);
771	writer.write(jsonStr + NEWLINE + NEWLINE);
772	} catch (IOException ex) {
773	//throw ex;
774	throw new UncheckedIOException(ex);
775	}
776	}
777
778	public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
779	String jsonStr = prettyPrintJson(doc.toJson());
780	//System.err.println(jsonStr);
781	try {
782	writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
783	} catch (IOException ex) {
784	//throw ex;
785	throw new UncheckedIOException(ex);
786	}
787	}
788
789	// TODO
790	//public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
791	public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
792	String jsonStr = doc.toJson();
793	JsonParser parser = new JsonParser();
794	JsonElement json = parser.parse(jsonStr);
795
796	JsonObject jsonObj = (JsonObject)json;
797
798	String countryCode = jsonObj.get("_id").getAsString();
799	int siteCount = jsonObj.get("count").getAsInt();
800	int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
801	int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
802	int totalPagesAcrossMatchingSites = jsonObj.get("totalPagesAcrossMatchingSites").getAsInt();
803
804	//System.err.println(jsonStr);
805	try {
806	//writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
807	csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount, totalPagesAcrossMatchingSites);
808	} catch (IOException ex) {
809	//throw ex;
810	throw new UncheckedIOException(ex);
811	}
812	}
813
814	public String prettyPrintJson(String jsonStr) {
815	Gson gson = new GsonBuilder().setPrettyPrinting().create();
816	JsonParser jp = new JsonParser();
817	JsonElement je = jp.parse(jsonStr);
818	String prettyJsonString = gson.toJson(je);
819	return prettyJsonString;
820	}
821
822	private List sortAlphabetically(Object list) {
823	//BsonArray domainList = (BsonArray)list;
824	ArrayList<BsonValue> domainList = (ArrayList<BsonValue>)list;
825	/*
826	// for(String domain : domainList) {
827	for(int i = domainList.size() - 1; i >= 0; i--) {
828	BsonString domain = domainList.get(i).asString();
829	String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
830	domainList.set(i, new BsonString(domainStr));
831	}
832	Collections.sort(domainList);
833	// still need to get rid of non-unique values...
834	*/
835
836	TreeSet<String> set = new TreeSet<String>();
837	for(int i = domainList.size() - 1; i >= 0; i--) {
838	///BsonValue val = domainList.get(i);
839	///BsonString domain = val.asString();
840	//BsonString domain = domainList.get(i).asString();
841	//String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
842	Object domain = domainList.get(i);
843	String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
844	set.add(domainStr);
845	//domainList.set(i, new BsonString(domainStr));
846	}
847
848	domainList = new ArrayList<BsonValue>(); //new BsonArray();
849	for(String s : set) {
850	domainList.add(new BsonString(s));
851	}
852	return domainList;
853	}
854
855	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: