Changeset 33885

Show
Ignore:
Timestamp:
31.01.2020 22:54:15 (3 weeks ago)
Author:
ak19
Message:

Attempting to write the tables. CSV output is not yet supported. Table 1 is done.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33883 r33885  
    3939 
    4040import java.io.BufferedReader; 
     41import java.io.BufferedWriter; 
    4142import java.io.File; 
    4243import java.io.FileReader; 
     44import java.io.FileWriter; 
    4345import java.io.IOException; 
    4446import java.io.UncheckedIOException; 
     
    512514    } 
    513515 
     516    /** Do the aggregates for writing out tables. 
     517       Table1: 
     518       db.Websites.aggregate([ 
     519        
     520       { $unwind: "$geoLocationCountryCode" }, 
     521       { 
     522       $group: { 
     523       _id: "$geoLocationCountryCode", 
     524       count: { $sum: 1 }, 
     525       //domain: { $addToSet: '$domain' }, 
     526       numPagesInMRICount: { $sum: '$numPagesInMRI' }, 
     527       numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 
     528       } 
     529       }, 
     530       { $sort : { count : -1} } 
     531       ]); 
     532    */ 
     533    public void writeTables(File outFolder) { 
            // Runs the "Table 1" aggregation over the Websites collection and writes each
            // resulting document, numbered from 1, to 1table_allCrawledSites.csv in outFolder.
            // Failures are logged here and not rethrown to the caller.
            // NOTE(review): despite the .csv file name, the records are written through
            // writeDoc(int, Document, Writer), which emits JSON text, not CSV rows.
     534    // In this function, we're always dealing with the Websites mongodb collection. 
     535    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
     536 
     537    // table 1 
     538    File outFile = new File(outFolder, "1table_allCrawledSites.csv"); 
            // try-with-resources: the writer is closed when the block exits, normally or not.
            // NOTE(review): FileWriter(File) is given no charset here, so the platform
            // default encoding is presumably used - confirm that is acceptable for this output.
     539    try ( 
     540         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     541         ) { 
     542         
     543         
            // Pipeline: unwind geoLocationCountryCode, group on it while accumulating
            // count (+1 per unwound entry), numPagesInMRICount and numPagesContainingMRICount,
            // then sort by count in descending order (-1).
     544        AggregateIterable<Document> output = collection.aggregate(Arrays.asList( 
     545                           //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")), 
     546         unwind("$geoLocationCountryCode"), 
     547         group("$geoLocationCountryCode", Arrays.asList( 
     548                                sum("count", 1), 
     549                                /*addToSet("domain", "$domain"),*/ 
     550                                sum("numPagesInMRICount", "$numPagesInMRI"), 
     551                                sum("numPagesContainingMRICount", "$numPagesContainingMRI"))), 
     552         sort(BasicDBObject.parse("{count : -1}")) 
     553        ));//.forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
     554 
            // docNum is pre-incremented, so the first document written is numbered 1.
     555        int docNum = 0; 
     556        for (Document doc : output) {        
     557        //System.out.println(doc); 
     558        writeDoc(++docNum, doc, writer);         
     559        } 
     560         
     561         
            // writeDoc wraps IOException in UncheckedIOException; it is caught and logged here.
     562    } catch(UncheckedIOException ioe) { 
     563        logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe); 
     564    } 
            // Any other exception raised inside the try block is logged with the output file path.
     565    catch(Exception e) { 
     566        logger.error("Could not write table to file " + outFile, e); 
     567    } 
     568    } 
     569 
     570    public void doTable1() { 
            // Empty stub: this method currently has no body and does nothing when called.
            // NOTE(review): presumably a placeholder for the table 1 logic that currently
            // lives in writeTables(File) - confirm the intent before relying on it.
     571     
     572    } 
     573     
     574     
     575     
    514576    /** 
    515577     * called by lambda forEach() call on Document objects to write them out to a file. 
     
    517579     * the actual forEach(). See 
    518580     * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach 
    519      */ 
    520      
     581     */     
    521582    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 
    522583    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); 
     
    545606    }        
    546607    } 
     608    public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException { 
            // Writes one numbered record to writer: a "/* docNum */" header line followed by
            // the result of prettyPrintJson(doc.toJson()) and then NEWLINE.
            // docNum is only used as the label in the header comment.
            // Throws UncheckedIOException (wrapping the IOException) if the write fails.
     609    String jsonStr = prettyPrintJson(doc.toJson()); 
     610    //System.err.println(jsonStr); 
     611    try { 
     612        writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE); 
     613    } catch (IOException ex) { 
     614        //throw ex; 
            // Rethrown unchecked so that callers need not declare or catch IOException.
     615        throw new UncheckedIOException(ex); 
     616    } 
     617    } 
     618     
     619     
    547620    public String prettyPrintJson(String jsonStr) { 
    548621    Gson gson = new GsonBuilder().setPrettyPrinting().create(); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33884 r33885  
    2020 * TO RUN: 
    2121 *    maori-lang-detection/src$ 
     22 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing 
     23 * OR: 
    2224 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 
    2325 * 
     
    344346 
    345347        // TODO: generate the tables 
     348 
     349        mongodb.writeTables(outFolder); 
    346350        } 
    347351