Changeset 33880

Show
Ignore:
Timestamp:
30.01.2020 21:17:40 (3 weeks ago)
Author:
ak19
Message:

Write out the 5counts_tentativeNonAutotranslatedSites.json file with proper pretty printing. This needed gson's capabilities to convert mongodb's Document result into a JSON string, because gson allows newlines between array elements whereas mongodb's JsonWriter didn't.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33879 r33880  
    1717import static com.mongodb.client.model.Accumulators.*; 
    1818 
     19 
    1920//import org.bson.conversions.Bson; 
    2021import com.mongodb.BasicDBObject; 
     
    2829import org.bson.Document; 
    2930import org.bson.conversions.Bson; 
     31import org.bson.json.JsonMode; 
     32import org.bson.json.JsonWriterSettings; 
    3033     
    3134import com.mongodb.util.JSON; 
    3235//import com.mongodb.DBObject; 
     36 
     37 
     38import com.google.gson.*; // for pretty printing 
    3339 
    3440import java.io.BufferedReader; 
    3541import java.io.File; 
    3642import java.io.FileReader; 
     43import java.io.IOException; 
     44import java.io.Writer; 
     45 
    3746import java.util.Arrays; 
    3847import java.util.ArrayList; 
     
    7988    public static final String WEBPAGES_COLLECTION = "Webpages"; 
    8089    public static final String WEBSITES_COLLECTION = "Websites"; 
     90 
     91    public static final String NEWLINE = System.getProperty("line.separator"); 
    8192     
    8293    /** mongodb filter types to execute */ 
     
    351362    } 
    352363 
     364    /**        
     365       The mongodb aggregate() we want to run this time: 
     366 
     367       db.Websites.aggregate([ 
     368       { 
     369        $match: { 
     370            $and: [ 
     371                {numPagesContainingMRI: {$gt: 0}}, 
     372                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
     373            ] 
     374          } 
     375    }, 
     376    { $unwind: "$geoLocationCountryCode" }, 
     377    { 
     378          $group: { 
     379            _id: "nz", 
     380            count: { $sum: 1 }, 
     381            domain: { $addToSet: '$domain' } 
     382          } 
     383    }, 
     384    { $sort : { count : -1} } 
     385    ]); 
     386    */ 
     387    public void aggregateContainsMRIForNZ(Writer writer) throws IOException { // runs the Websites aggregation and writes each result document to writer as pretty-printed JSON
     388    // working with the WebSites collection, not WebPages collection! 
     389    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
     390 
     391     
     392    //String isMRI_filter =  
     393     
     394    Bson orQuery = or( 
     395              BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 
     396              BasicDBObject.parse("{domain: /\\.nz/}") 
     397              ); // site counts as NZ if geolocated to NZ or its domain matches /\.nz/
     398    Bson andQuery = and( 
     399        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
     400        orQuery); // numPagesContainingMRI > 0 AND the NZ test
     401     
     402    AggregateIterable<Document> output 
     403        = collection.aggregate(Arrays.asList( 
     404         match(andQuery), 
     405         unwind("$geoLocationCountryCode"), 
     406         group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), // NOTE(review): the shell pipeline in the method comment uses _id "nz", the code uses "NZ" - confirm which is intended
     407         sort(BasicDBObject.parse("{count : -1}")) // descending by count
     408     )); 
     409 
     410    // should only have one doc, since every match is grouped under the single constant _id 
     411    for (Document doc : output) { 
     412        //System.out.println(doc); 
     413        System.out.println(doc.toJson()); // echo the document's JSON (default toJson() settings) to stdout as well
     414        // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 
     415        //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE); 
     416        /* 
     417        JsonWriterSettings writeSettings = new JsonWriterSettings(); 
     418        writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();         
     419        writer.write(doc.toJson(writeSettings) + NEWLINE); 
     420        */ 
     421        writer.write(prettyPrintJson(doc.toJson()) + NEWLINE); // re-format via gson, then terminate with the platform line separator
     422    } 
     423 
     424    return; 
     425    } 
     426     
    353427    /** 
    354428       RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: 
     
    387461    ]); 
    388462 
     463 
     464    https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line 
    389465    */ 
    390     public String aggregateContainsMRIForOverseas() { 
     466    public void aggregateContainsMRIForOverseas(Writer writer) throws IOException { 
    391467    // working with the WebSites collection, not WebPages collection! 
    392468    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
     
    425501        //System.out.println(doc); 
    426502        System.out.println(doc.toJson()); 
    427     } 
    428  
    429     return ""; 
    430     } 
    431  
    432  
    433     /**        
    434        The mongodb aggregate() we want to run this time: 
    435  
    436        db.Websites.aggregate([ 
    437        { 
    438         $match: { 
    439             $and: [ 
    440                 {numPagesContainingMRI: {$gt: 0}}, 
    441                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
    442             ] 
    443           } 
    444     }, 
    445     { $unwind: "$geoLocationCountryCode" }, 
    446     { 
    447           $group: { 
    448             _id: "nz", 
    449             count: { $sum: 1 }, 
    450             domain: { $addToSet: '$domain' } 
    451           } 
    452     }, 
    453     { $sort : { count : -1} } 
    454     ]); 
    455     */ 
    456     public String aggregateContainsMRIForNZ() { 
    457     // working with the WebSites collection, not WebPages collection! 
    458     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    459  
    460      
    461     //String isMRI_filter =  
    462      
    463     Bson orQuery = or( 
    464               BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 
    465               BasicDBObject.parse("{domain: /\\.nz/}") 
    466               ); 
    467     Bson andQuery = and( 
    468         BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
    469         orQuery); 
    470      
    471     AggregateIterable<Document> output 
    472         = collection.aggregate(Arrays.asList( 
    473          match(andQuery), 
    474          unwind("$geoLocationCountryCode"), 
    475          group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
    476          sort(BasicDBObject.parse("{count : -1}")) 
    477      )); 
    478  
    479     // should only have one doc 
    480     for (Document doc : output) { 
    481         //System.out.println(doc); 
    482         System.out.println(doc.toJson()); 
    483     } 
    484  
    485     return ""; 
    486     } 
     503        // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 
     504        //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE); 
     505        /* 
     506        JsonWriterSettings writeSettings = new JsonWriterSettings(); 
     507        writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); 
     508        writer.write(doc.toJson(writeSettings) + NEWLINE); 
     509        */ 
     510        writer.write(prettyPrintJson(doc.toJson()) + NEWLINE); 
     511    } 
     512 
     513    return; 
     514    } 
     515 
     516 
     517    public String prettyPrintJson(String jsonStr) { // returns jsonStr re-serialised by gson with pretty printing enabled
     518    Gson gson = new GsonBuilder().setPrettyPrinting().create(); // a new pretty-printing Gson is built on every call
     519    JsonParser jp = new JsonParser(); 
     520    JsonElement je = jp.parse(jsonStr); // parse the input text into gson's JSON tree
     521    String prettyJsonString = gson.toJson(je); // serialise the tree back to text using the pretty-printing settings
     522    return prettyJsonString; 
     523    } 
     524 
    487525 
    488526    public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) { 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33879 r33880  
    2727    private int numURLs; 
    2828    private File domainsFile; 
     29 
     30 
    2931     
    3032    public WebPageURLsListing(MongoDBAccess mongodbAccess, 
     
    120122    return outFile.getAbsolutePath(); 
    121123    } 
     124 
     125    /* ---------------------------------------- */ 
     126 
     127    /**  
     128     * Create the file 5counts_tentativeNonAutotranslatedSites.json in ../mongodb-data/, containing the output of aggregateContainsMRIForNZ() followed by that of aggregateContainsMRIForOverseas(). Failures are logged, not rethrown. 
     129     * @return canonical path of the file if everything in the try block succeeded, otherwise its absolute path 
     130     */ 
     131    public String writeTentativeNonAutotranslatedSites() { 
     132    File outFolder = new File("../mongodb-data/").getAbsoluteFile(); // relative path, made absolute here
     133    File outFile = new File(outFolder, "5counts_tentativeNonAutotranslatedSites.json"); 
     134 
     135    String filename = outFile.getAbsolutePath(); // fallback return value if the try block below fails
     136     
     137    try ( 
     138         Writer writer = new BufferedWriter(new FileWriter(outFile)); // no append flag: an existing file is overwritten; default charset is used
     139         ) { 
     140        // first write out NZ sites and .nz TLD count and domains 
     141        mongodbAccess.aggregateContainsMRIForNZ(writer); 
     142        // next write out all overseas sites and .nz TLD count and domains 
     143        mongodbAccess.aggregateContainsMRIForOverseas(writer); 
     144 
     145        filename = outFile.getCanonicalPath(); // only reached when both aggregates were written without an exception
     146    } catch(Exception e) { // deliberately broad: any failure is logged and the method still returns a path
     147        logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
     148        logger.error(e.getMessage(), e); 
     149    } 
     150 
     151    return filename; 
     152    } 
    122153     
    123154     
     
    152183        //String isMRIFile = listing.produceURLsForPagesInMRI(); 
    153184        //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 
    154         mongodb.aggregateContainsMRIForNZ(); 
    155         mongodb.aggregateContainsMRIForOverseas(); 
     185        String filename = listing.writeTentativeNonAutotranslatedSites(); 
     186        System.err.println("Check file: " + filename); 
     187 
    156188         
    157189    } catch(Exception e) {