Changeset 33885 for other-projects


Ignore:
Timestamp:
2020-01-31T22:54:15+13:00 (4 years ago)
Author:
ak19
Message:

Attempting to write the tables. CSV output is not yet supported (results are written as JSON for now). Table 1 is done.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33883 r33885  
    3939
    4040import java.io.BufferedReader;
     41import java.io.BufferedWriter;
    4142import java.io.File;
    4243import java.io.FileReader;
     44import java.io.FileWriter;
    4345import java.io.IOException;
    4446import java.io.UncheckedIOException;
     
    512514    }
    513515
     516    /** Do the aggregates for writing out tables.
     517       Table1:
     518       db.Websites.aggregate([
     519       
     520       { $unwind: "$geoLocationCountryCode" },
     521       {
     522       $group: {
     523       _id: "$geoLocationCountryCode",
     524       count: { $sum: 1 },
     525       //domain: { $addToSet: '$domain' },
     526       numPagesInMRICount: { $sum: '$numPagesInMRI' },
     527       numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     528       }
     529       },
     530       { $sort : { count : -1} }
     531       ]);
     532    */
     533    public void writeTables(File outFolder) {
     534    // In this function, we're always dealing with the Websites mongodb collection.
     535    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     536
     537    // table 1
     538    File outFile = new File(outFolder, "1table_allCrawledSites.csv");
     539    try (
     540         Writer writer = new BufferedWriter(new FileWriter(outFile));
     541         ) {
     542       
     543       
     544        AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
     545                           //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
     546         unwind("$geoLocationCountryCode"),
     547         group("$geoLocationCountryCode", Arrays.asList(
     548                                sum("count", 1),
     549                                /*addToSet("domain", "$domain"),*/
     550                                sum("numPagesInMRICount", "$numPagesInMRI"),
     551                                sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
     552         sort(BasicDBObject.parse("{count : -1}"))
     553        ));//.forEach((Block<Document>)doc -> writeDoc(doc, writer));
     554
     555        int docNum = 0;
     556        for (Document doc : output) {       
     557        //System.out.println(doc);
     558        writeDoc(++docNum, doc, writer);       
     559        }
     560       
     561       
     562    } catch(UncheckedIOException ioe) {
     563        logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
     564    }
     565    catch(Exception e) {
     566        logger.error("Could not write table to file " + outFile, e);
     567    }
     568    }
     569
     570    public void doTable1() {
     571   
     572    }
     573   
     574   
     575   
    514576    /**
    515577     * called by lambda forEach() call on Document objects to write them out to a file.
     
    517579     * the actual forEach(). See
    518580     * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
    519      */
    520    
     581     */   
    521582    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
    522583    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
     
    545606    }       
    546607    }
     608    public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException {
     609    String jsonStr = prettyPrintJson(doc.toJson());
     610    //System.err.println(jsonStr);
     611    try {
     612        writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
     613    } catch (IOException ex) {
     614        //throw ex;
     615        throw new UncheckedIOException(ex);
     616    }
     617    }
     618   
     619   
    547620    public String prettyPrintJson(String jsonStr) {
    548621    Gson gson = new GsonBuilder().setPrettyPrinting().create();
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33884 r33885  
    2020 * TO RUN:
    2121 *    maori-lang-detection/src$
     22 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing
     23 * OR:
    2224 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
    2325 *
     
    344346
    345347        // TODO: generate the tables
     348
     349        mongodb.writeTables(outFolder);
    346350        }
    347351
Note: See TracChangeset for help on using the changeset viewer.