Changeset 33880 for other-projects


Ignore:
Timestamp:
2020-01-30T21:17:40+13:00 (4 years ago)
Author:
ak19
Message:

Write out the 5counts_tentativeNonAutotranslatedSites.json file with proper pretty printing. Gson is now used to re-format the JSON string obtained from MongoDB's Document result, because Gson's pretty printer puts newlines between array elements, whereas MongoDB's JsonWriter did not.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33879 r33880  
    1717import static com.mongodb.client.model.Accumulators.*;
    1818
     19
    1920//import org.bson.conversions.Bson;
    2021import com.mongodb.BasicDBObject;
     
    2829import org.bson.Document;
    2930import org.bson.conversions.Bson;
     31import org.bson.json.JsonMode;
     32import org.bson.json.JsonWriterSettings;
    3033   
    3134import com.mongodb.util.JSON;
    3235//import com.mongodb.DBObject;
     36
     37
     38import com.google.gson.*; // for pretty printing
    3339
    3440import java.io.BufferedReader;
    3541import java.io.File;
    3642import java.io.FileReader;
     43import java.io.IOException;
     44import java.io.Writer;
     45
    3746import java.util.Arrays;
    3847import java.util.ArrayList;
     
    7988    public static final String WEBPAGES_COLLECTION = "Webpages";
    8089    public static final String WEBSITES_COLLECTION = "Websites";
     90
     91    public static final String NEWLINE = System.getProperty("line.separator");
    8192   
    8293    /** mongodb filter types to execute */
     
    351362    }
    352363
     364    /**       
     365       The mongodb aggregate() we want to run this time:
     366
     367       db.Websites.aggregate([
     368       {
     369        $match: {
     370            $and: [
     371                {numPagesContainingMRI: {$gt: 0}},
     372                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     373            ]
     374          }
     375    },
     376    { $unwind: "$geoLocationCountryCode" },
     377    {
     378          $group: {
     379            _id: "nz",
     380            count: { $sum: 1 },
     381            domain: { $addToSet: '$domain' }
     382          }
     383    },
     384    { $sort : { count : -1} }
     385    ]);
     386    */
     387    public void aggregateContainsMRIForNZ(Writer writer) throws IOException {
     388    // working with the WebSites collection, not WebPages collection!
     389    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     390
     391   
     392    //String isMRI_filter =
     393   
     394    Bson orQuery = or(
     395              BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
     396              BasicDBObject.parse("{domain: /\\.nz/}")
     397              );
     398    Bson andQuery = and(
     399        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
     400        orQuery);
     401   
     402    AggregateIterable<Document> output
     403        = collection.aggregate(Arrays.asList(
     404         match(andQuery),
     405         unwind("$geoLocationCountryCode"),
     406         group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     407         sort(BasicDBObject.parse("{count : -1}"))
     408     ));
     409
     410    // should only have one doc
     411    for (Document doc : output) {
     412        //System.out.println(doc);
     413        System.out.println(doc.toJson());
     414        // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
     415        //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE);
     416        /*
     417        JsonWriterSettings writeSettings = new JsonWriterSettings();
     418        writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();       
     419        writer.write(doc.toJson(writeSettings) + NEWLINE);
     420        */
     421        writer.write(prettyPrintJson(doc.toJson()) + NEWLINE);
     422    }
     423
     424    return;
     425    }
     426   
    353427    /**
    354428       RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
     
    387461    ]);
    388462
     463
     464    https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
    389465    */
    390     public String aggregateContainsMRIForOverseas() {
     466    public void aggregateContainsMRIForOverseas(Writer writer) throws IOException {
    391467    // working with the WebSites collection, not WebPages collection!
    392468    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     
    425501        //System.out.println(doc);
    426502        System.out.println(doc.toJson());
    427     }
    428 
    429     return "";
    430     }
    431 
    432 
    433     /**       
    434        The mongodb aggregate() we want to run this time:
    435 
    436        db.Websites.aggregate([
    437        {
    438         $match: {
    439             $and: [
    440                 {numPagesContainingMRI: {$gt: 0}},
    441                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
    442             ]
    443           }
    444     },
    445     { $unwind: "$geoLocationCountryCode" },
    446     {
    447           $group: {
    448             _id: "nz",
    449             count: { $sum: 1 },
    450             domain: { $addToSet: '$domain' }
    451           }
    452     },
    453     { $sort : { count : -1} }
    454     ]);
    455     */
    456     public String aggregateContainsMRIForNZ() {
    457     // working with the WebSites collection, not WebPages collection!
    458     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    459 
    460    
    461     //String isMRI_filter =
    462    
    463     Bson orQuery = or(
    464               BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
    465               BasicDBObject.parse("{domain: /\\.nz/}")
    466               );
    467     Bson andQuery = and(
    468         BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
    469         orQuery);
    470    
    471     AggregateIterable<Document> output
    472         = collection.aggregate(Arrays.asList(
    473          match(andQuery),
    474          unwind("$geoLocationCountryCode"),
    475          group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
    476          sort(BasicDBObject.parse("{count : -1}"))
    477      ));
    478 
    479     // should only have one doc
    480     for (Document doc : output) {
    481         //System.out.println(doc);
    482         System.out.println(doc.toJson());
    483     }
    484 
    485     return "";
    486     }
     503        // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
     504        //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE);
     505        /*
     506        JsonWriterSettings writeSettings = new JsonWriterSettings();
     507        writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
     508        writer.write(doc.toJson(writeSettings) + NEWLINE);
     509        */
     510        writer.write(prettyPrintJson(doc.toJson()) + NEWLINE);
     511    }
     512
     513    return;
     514    }
     515
     516
     517    public String prettyPrintJson(String jsonStr) {
     518    Gson gson = new GsonBuilder().setPrettyPrinting().create();
     519    JsonParser jp = new JsonParser();
     520    JsonElement je = jp.parse(jsonStr);
     521    String prettyJsonString = gson.toJson(je);
     522    return prettyJsonString;
     523    }
     524
    487525
    488526    public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) {
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33879 r33880  
    2727    private int numURLs;
    2828    private File domainsFile;
     29
     30
    2931   
    3032    public WebPageURLsListing(MongoDBAccess mongodbAccess,
     
    120122    return outFile.getAbsolutePath();
    121123    }
     124
     125    /* ---------------------------------------- */
     126
     127    /**
     128     * Create the file
     129     * @return full path of file generated
     130     */
     131    public String writeTentativeNonAutotranslatedSites() {
     132    File outFolder = new File("../mongodb-data/").getAbsoluteFile();
     133    File outFile = new File(outFolder, "5counts_tentativeNonAutotranslatedSites.json");
     134
     135    String filename = outFile.getAbsolutePath();
     136   
     137    try (
     138         Writer writer = new BufferedWriter(new FileWriter(outFile));
     139         ) {
     140        // first write out NZ sites and .nz TLD count and domains
     141        mongodbAccess.aggregateContainsMRIForNZ(writer);
     142        // next write out all overseas sites and .nz TLD count and domains
     143        mongodbAccess.aggregateContainsMRIForOverseas(writer);
     144
     145        filename = outFile.getCanonicalPath();
     146    } catch(Exception e) {
     147        logger.error("Unable to write to file " + outFile.getAbsolutePath());
     148        logger.error(e.getMessage(), e);
     149    }
     150
     151    return filename;
     152    }
    122153   
    123154   
     
    152183        //String isMRIFile = listing.produceURLsForPagesInMRI();
    153184        //String containsMRIFile = listing.produceURLsForPagesContainingMRI();
    154         mongodb.aggregateContainsMRIForNZ();
    155         mongodb.aggregateContainsMRIForOverseas();
     185        String filename = listing.writeTentativeNonAutotranslatedSites();
     186        System.err.println("Check file: " + filename);
     187
    156188       
    157189    } catch(Exception e) {
Note: See TracChangeset for help on using the changeset viewer.