Changeset 33887 for other-projects


Ignore:
Timestamp:
2020-01-31T23:49:11+13:00 (4 years ago)
Author:
ak19
Message:
  1. Added support for writing out tables in CSV format too (alongside the existing JSON output). 2. The second table is now written out as well. 3. Moved getFilePath() into Utility.
Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33885 r33887  
    1717import static com.mongodb.client.model.Accumulators.*;
    1818
    19 
    2019//import org.bson.conversions.Bson;
    2120import com.mongodb.BasicDBObject;
     
    5756import org.greenstone.atea.morphia.*;
    5857import dev.morphia.*;
     58
     59import org.apache.commons.csv.*;
    5960
    6061/**
     
    8384 *   - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
    8485 *
     86 * API:
     87 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
     88 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
    8589 */
    8690public class MongoDBAccess implements AutoCloseable {
     
    516520    /** Do the aggregates for writing out tables.
    517521       Table1:
    518        db.Websites.aggregate([
    519522       
    520        { $unwind: "$geoLocationCountryCode" },
    521        {
    522        $group: {
    523        _id: "$geoLocationCountryCode",
    524        count: { $sum: 1 },
    525        //domain: { $addToSet: '$domain' },
    526        numPagesInMRICount: { $sum: '$numPagesInMRI' },
    527        numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
    528        }
    529        },
    530        { $sort : { count : -1} }
    531        ]);
    532523    */
    533524    public void writeTables(File outFolder) {
     
    535526    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    536527
    537     // table 1
    538     File outFile = new File(outFolder, "1table_allCrawledSites.csv");
    539     try (
    540          Writer writer = new BufferedWriter(new FileWriter(outFile));
    541          ) {
     528    String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"};
     529    for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
     530        File outFile = new File(outFolder, tableNames[tableNum] + ".json");
     531        File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
     532        try (
     533         Writer writer = new BufferedWriter(new FileWriter(outFile));
     534         CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT);
     535         ) {
     536
     537        // Write out the CSV column headings
     538        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
     539        csvWriter.printRecord("countryCode", "siteCount",
     540              "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/);
     541       
     542        AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
     543       
     544        int docNum = 0;
     545        for (Document doc : output) {       
     546            //System.out.println(doc);
     547            writeDocAsJsonRecord(++docNum, doc, writer);
     548            writeDocAsCSVRecord(++docNum, doc, csvWriter);
     549        }       
     550        logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
     551        } catch(UncheckedIOException ioe) {
     552        logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
     553        }
     554        catch(Exception e) {
     555        logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
     556        }
     557    }
     558    }
     559
     560    public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) {
     561   
     562    AggregateIterable<Document> output = null;
     563
     564    switch(tableNum) {
    542565       
    543        
    544         AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
     566    case 1:
     567        /* 1table_allCrawledSites -
     568
     569           db.Websites.aggregate([       
     570           { $unwind: "$geoLocationCountryCode" },
     571           {
     572           $group: {
     573           _id: "$geoLocationCountryCode",
     574           count: { $sum: 1 },
     575           //domain: { $addToSet: '$domain' },
     576           numPagesInMRICount: { $sum: '$numPagesInMRI' },
     577           numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     578           }
     579           },
     580           { $sort : { count : -1} }
     581           ]);
     582         */
     583        output = collection.aggregate(Arrays.asList(
    545584                           //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
    546585         unwind("$geoLocationCountryCode"),
     
    550589                                sum("numPagesInMRICount", "$numPagesInMRI"),
    551590                                sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
    552          sort(BasicDBObject.parse("{count : -1}"))
    553         ));//.forEach((Block<Document>)doc -> writeDoc(doc, writer));
    554 
    555         int docNum = 0;
    556         for (Document doc : output) {       
    557         //System.out.println(doc);
    558         writeDoc(++docNum, doc, writer);       
    559         }
     591         sort(BasicDBObject.parse("{count : -1}"))       
     592        ));
     593        break;
     594
     595    case 2:
     596        /*
     597          db.Websites.aggregate([
     598          { $match: { numPagesInMRI: {$gt: 0} } },
     599          { $unwind: "$geoLocationCountryCode" },
     600          {
     601          $group: {
     602          _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower
     603          count: { $sum: 1 },
     604          //domain: { $addToSet: '$domain' },
     605          numPagesInMRICount: { $sum: '$numPagesInMRI' },
     606          numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     607          }
     608          },
     609          { $sort : { count : -1} }
     610          ]);
     611         */
     612         output = collection.aggregate(Arrays.asList(
     613                           match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
     614         unwind("$geoLocationCountryCode"),
     615         group("$geoLocationCountryCode", Arrays.asList(
     616                                sum("count", 1),
     617                                /*addToSet("domain", "$domain"),*/
     618                                sum("numPagesInMRICount", "$numPagesInMRI"),
     619                                sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
     620         sort(BasicDBObject.parse("{count : -1}"))       
     621        ));
     622        break;
    560623       
     624    default: logger.error("Unknown table number: " + tableNum);
    561625       
    562     } catch(UncheckedIOException ioe) {
    563         logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
    564     }
    565     catch(Exception e) {
    566         logger.error("Could not write table to file " + outFile, e);
    567     }
    568     }
    569 
    570     public void doTable1() {
     626    }
     627
     628     return output;
    571629   
    572630    }
     
    606664    }       
    607665    }
    608     public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException {
     666   
     667    public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
    609668    String jsonStr = prettyPrintJson(doc.toJson());
    610669    //System.err.println(jsonStr);
     
    616675    }
    617676    }
    618    
     677
     678    // TODO
     679    //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
     680    public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
     681    String jsonStr = doc.toJson();
     682    JsonParser parser = new JsonParser();
     683    JsonElement json = parser.parse(jsonStr);
     684
     685    JsonObject jsonObj = (JsonObject)json;
     686   
     687    String countryCode = jsonObj.get("_id").getAsString();
     688    int siteCount = jsonObj.get("count").getAsInt();
     689    int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
     690    int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt(); 
     691   
     692    //System.err.println(jsonStr);
     693    try {
     694        //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
     695        csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount);
     696    } catch (IOException ex) {
     697        //throw ex;
     698        throw new UncheckedIOException(ex);
     699    }
     700    }
    619701   
    620702    public String prettyPrintJson(String jsonStr) {
  • other-projects/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33666 r33887  
    1111public class Utility {
    1212    private static Logger logger = Logger.getLogger(org.greenstone.atea.Utility.class.getName());
     13   
     14    public static String getFilePath(File file) {
     15    try {
     16        return file.getCanonicalPath();
     17    } catch(IOException e) {
     18        return file.getAbsolutePath();
     19    }
     20    }
    1321   
    1422    // Run gunzip
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33885 r33887  
    5555    }
    5656
    57     private String getFilePath(File file) {
    58     try {
    59         return file.getCanonicalPath();
    60     } catch(IOException e) {
    61         return file.getAbsolutePath();
    62     }
    63     }
    6457   
    6558    public void produceURLsForPagesInMRI(File domainsFile) {
     
    6962   
    7063    System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
    71                + getFilePath(outFile));
     64               + Utility.getFilePath(outFile));
    7265    }
    7366   
     
    7871
    7972    System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
    80                + getFilePath(outFile));
     73               + Utility.getFilePath(outFile));
    8174    }
    8275   
     
    120113        System.err.println("");
    121114    } catch(Exception e) {
    122         logger.error("Unable to read URLs from file " + getFilePath(domainsFile));
     115        logger.error("Unable to read URLs from file " + Utility.getFilePath(domainsFile));
    123116        logger.error(e.getMessage(), e);
    124117    }
     
    159152    writeURLsToFile(urlsList, outFile, N_totalNumPages);
    160153    System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
    161                + "\ninto file: " + getFilePath(outFile));
     154               + "\ninto file: " + Utility.getFilePath(outFile));
    162155       
    163156    // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
     
    175168    writeURLsToFile(urlsList, outFile, n_numSampleURLs);
    176169    System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
    177                + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile));
     170               + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile));
    178171    }
    179172
     
    219212        }
    220213    } catch(Exception e) {
    221         logger.error("Unable to write to file " + getFilePath(outFile));
     214        logger.error("Unable to write to file " + Utility.getFilePath(outFile));
    222215        logger.error(e.getMessage(), e);
    223216    }
     
    237230    File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
    238231
    239     String filename = getFilePath(outFile);
     232    String filename = Utility.getFilePath(outFile);
    240233   
    241234    try (
     
    268261    File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
    269262
    270     String filename = getFilePath(outFile);
     263    String filename = Utility.getFilePath(outFile);
    271264    try (
    272265         Writer writer = new BufferedWriter(new FileWriter(outFile));
Note: See TracChangeset for help on using the changeset viewer.