Changeset 33909 for other-projects


Ignore:
Timestamp:
2020-02-12T19:02:44+13:00 (4 years ago)
Author:
ak19
Message:
  1. Implementing tables 3 to 5. 2. Rolled back the introduction of the basicDomain field (domain stripped of http/https and www prefixes) as the code can create and sort this field alphabetically, whereas it didn't sort properly in mongodb. 3. The code now does sort the domains stripped of protocol and www for the mongodb queries producing domain results and ensures the domain list is unique. 4. Split the MongoDBAccess class into 2, with the connection code in MongoDBAccess.java and the querying code in MongoDBQueryer (a subclass of MongoDBAccess) that is so far exclusively used by WebPageURLsListing.java
Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33906 r33909  
    2828import org.bson.BsonArray;
    2929import org.bson.BsonString;
     30import org.bson.BsonValue;
    3031import org.bson.Document;
    3132import org.bson.conversions.Bson;
     
    5253import java.util.List;
    5354import java.util.Properties;
     55import java.util.TreeSet;
    5456import java.util.regex.Pattern;
    5557
     
    98100    public static final String WEBSITES_COLLECTION = "Websites";
    99101
    100     public static final String NEWLINE = System.getProperty("line.separator");
    101    
    102     /** mongodb filter types to execute */
    103     public static final int IS_MRI = 0;
    104     public static final int CONTAINS_MRI = 1;
    105 
    106     /** Some reused fieldnames in the Websites collection */
    107     private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
    108     private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
    109 
     102   
    110103    // configuration details, some with fallback values
    111     private String HOST = "localhost";
    112     private int PORT = 27017; // mongodb port
    113     private String USERNAME;
    114     private String PASSWORD;   
    115     private String DB_NAME ="ateacrawldata";
    116    
    117     private MongoClient mongo = null;
    118     private MongoDatabase database = null;
     104    protected String HOST = "localhost";
     105    protected int PORT = 27017; // mongodb port
     106    protected String USERNAME;
     107    protected String PASSWORD;   
     108    protected String DB_NAME ="ateacrawldata";
     109   
     110    protected MongoClient mongo = null;
     111    protected MongoDatabase database = null;
    119112
    120113    /**
     
    210203        System.err.println("coll: " + coll);
    211204    }
     205    }
     206
     207    protected MongoCollection<Document> getWebpagesCollection() {
     208    return this.database.getCollection(WEBPAGES_COLLECTION);
     209    }
     210    protected MongoCollection<Document> getWebsitesCollection() {
     211    return this.database.getCollection(WEBSITES_COLLECTION);
    212212    }
    213213   
     
    219219        .append("siteFolderName", website.siteFolderName)
    220220        .append("domain", website.domain)
    221         .append("basicDomain", website.basicDomain)
    222221        .append("totalPages", website.totalPages)
    223222        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
     
    303302    }
    304303    */
    305    
    306     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
    307     return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
    308     }
    309     public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
    310     return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
    311     }
    312    
    313     /**
    314      * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
    315      * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
    316      * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
    317      * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
    318      *
    319      * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
    320      * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
    321      * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
    322      * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
    323      */
    324     public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
    325    
    326     final ArrayList<String> urlsList = new ArrayList<String>();
    327    
    328     // remove any http(s)://(www.) from the start of URL first
    329     // since it goes into a regex
    330     domain = Utility.stripProtocolAndWWWFromURL(domain);
    331 
    332     // load the "webpages" db table
    333     // in mongodb, the equivalent of db tables are called 'collections'
    334     MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
    335 
    336     // code we'll execute in Iterable.forEach() below
    337     // see also https://www.baeldung.com/foreach-java
    338     Block<Document> storeURL = new Block<Document>() {
    339         @Override
    340         public void apply(final Document document) {
    341             //System.out.println(document.toJson());
    342             String url = document.getString("URL");
    343             // add to our urlsList
    344             //System.out.println(url);
    345             urlsList.add(url);
    346         }
    347         };
    348 
    349    
    350     // Run the following mongodb query:
    351     //    db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
    352    
    353     // 1. One way that works:
    354     //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
    355 
    356     // 2. Another way:
    357     //String query = "{URL: /DOMAIN/, isMRI: true}";
    358     String query = "{URL: /DOMAIN/, ";
    359     if(filterType == IS_MRI) {
    360         query += "isMRI: true}";
    361     } else if(filterType == CONTAINS_MRI) {
    362         query += "containsMRI: true}";
    363     }
    364    
    365     domain = domain.replace(".", "\\."); // escape dots in domain for regex
    366     query = query.replace("DOMAIN", domain);
    367 
    368     //System.err.println("Executing find query: " + query);
    369    
    370     BasicDBObject findObj = BasicDBObject.parse(query);
    371     BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
    372    
    373 
    374     collection.find(findObj).projection(projectionObj).forEach(storeURL);
    375    
    376     return urlsList;
    377     }
    378 
    379     /**     
    380      * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
    381      *
    382      * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
    383      * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
    384      * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
    385      *
    386      * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
    387      * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
    388      * On using group(TExpression) inside collection.aggregate().
    389      *
    390      *  For forEach lambda expressions, see also https://www.baeldung.com/foreach-java
    391      *  and https://www.javatpoint.com/java-8-foreach
    392      *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
    393      *
    394      * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,
    395      * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
    396      * and total counts of numPagesInMRI and numPagesContainingMRI across all these
    397      * matching sites.
    398      *
    399      * The mongodb aggregate() we want to run this time:
    400      *
    401        db.Websites.aggregate([
    402        {
    403         $match: {
    404             $and: [
    405                 {numPagesContainingMRI: {$gt: 0}},
    406                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
    407             ]
    408           }
    409     },
    410     { $unwind: "$geoLocationCountryCode" },
    411     {
    412           $group: {
    413             _id: "nz",
    414             count: { $sum: 1 },           
    415         domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
    416           }
    417     },
    418     { $sort : { count : -1} }
    419     ]);
    420     */
    421     public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
    422     // working with the WebSites collection, not WebPages collection!
    423     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    424 
    425     String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
    426    
    427     Bson orQuery = or(
    428               BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
    429               BasicDBObject.parse("{domain: /\\.nz/}")
    430               );
    431     Bson andQuery = and(
    432         BasicDBObject.parse(mriFilterString),
    433         orQuery);
    434    
    435     // Hopefully the lambda expression (forEach()) at end means
    436     // we write out each result Document as we get it
    437     collection.aggregate(Arrays.asList(
    438          match(andQuery),
    439          unwind("$geoLocationCountryCode"),
    440          group("NZ", Arrays.asList(sum("count", 1),
    441                    addToSet("domain", "$basicDomain"))),
    442          sort(BasicDBObject.parse("{count : -1}"))
    443      )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
    444 
    445     // should only have one doc for NZ since it's a count by geolocation.
    446 
    447     return;
    448     }
    449    
    450     /**
    451      * Count by country code of non-NZ websites containing a positive number of sentences in MRI,
    452      * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
    453      * and total counts of numPagesInMRI and numPagesContainingMRI across all these
    454      * matching sites.
    455      *
    456      * The aggregate() we want to run this time:
    457      *
    458        db.Websites.aggregate([
    459        {
    460          $match: {
    461             $and: [
    462                 {geoLocationCountryCode: {$ne: "NZ"}},
    463                 {domain: {$not: /\.nz/}},
    464                 {numPagesContainingMRI: {$gt: 0}},
    465                 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
    466             ]
    467       }
    468     },
    469     { $unwind: "$geoLocationCountryCode" },
    470     {
    471           $group: {
    472             _id: {$toLower: '$geoLocationCountryCode'},
    473             count: { $sum: 1 },
    474         domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
    475           }
    476      },
    477      { $sort : { count : -1} }
    478     ]);
    479     */
    480     public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
    481                         boolean isMiInURLPath) throws UncheckedIOException
    482     {
    483     // working with the WebSites collection, not WebPages collection!
    484     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    485    
    486     String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
    487    
    488     Bson orQuery = or(
    489               BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
    490               BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
    491               // e.g. "{urlContainsLangCodeInPath: false}"
    492               );
    493     Bson andQuery = and(
    494         BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
    495         BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
    496         BasicDBObject.parse(mriFilterString),
    497         orQuery);
    498 
    499     collection.aggregate(Arrays.asList(
    500          match(andQuery),  //match(BasicDBObject.parse(matchQuery))
    501          // match((List<DBObject>)JSON.parse(matchQuery)),
    502          unwind("$geoLocationCountryCode"),
    503          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
    504                         addToSet("domain", "$basicDomain"))),
    505          sort(BasicDBObject.parse("{count : -1}"))
    506        )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
    507 
    508     // casting to Block<Document> necessary because otherwise we see the error at
    509     // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
    510 
    511     // Less efficient way is to keep all the results in memory and then
    512     // write them out one at a time
    513     /*
    514     AggregateIterable<Document> output
    515         = collection.aggregate(Arrays.asList(
    516          match(andQuery),  //match(BasicDBObject.parse(matchQuery))
    517          // match((List<DBObject>)JSON.parse(matchQuery)),
    518          unwind("$geoLocationCountryCode"),
    519          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
    520          sort(BasicDBObject.parse("{count : -1}"))
    521      ));
    522 
    523    
    524     for (Document doc : output) {
    525         //System.out.println(doc);
    526         System.out.println(doc.toJson());
    527        
    528     }
    529     */
    530     return;
    531     }
    532 
    533     /** Do the aggregates for writing out tables.
    534        Table1:
    535        
    536     */
    537     public void writeTables(File outFolder) {
    538     // In this function, we're always dealing with the Websites mongodb collection.
    539     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    540 
    541     String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"};
    542     for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
    543         File outFile = new File(outFolder, tableNames[tableNum] + ".json");
    544         File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
    545         try (
    546          Writer writer = new BufferedWriter(new FileWriter(outFile));
    547          CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT);
    548          ) {
    549 
    550         // Write out the CSV column headings
    551         // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
    552         csvWriter.printRecord("countryCode", "siteCount",
    553               "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/);
    554        
    555         AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
    556        
    557         int docNum = 0;
    558         for (Document doc : output) {       
    559             //System.out.println(doc);
    560             writeDocAsJsonRecord(++docNum, doc, writer);
    561             writeDocAsCSVRecord(++docNum, doc, csvWriter);
    562         }       
    563         logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
    564         } catch(UncheckedIOException ioe) {
    565         logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
    566         }
    567         catch(Exception e) {
    568         logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
    569         }
    570     }
    571     }
    572 
    573     public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) {
    574    
    575     AggregateIterable<Document> output = null;
    576 
    577     switch(tableNum) {
    578        
    579     case 1:
    580         /* 1table_allCrawledSites -
    581 
    582            db.Websites.aggregate([       
    583            { $unwind: "$geoLocationCountryCode" },
    584            {
    585            $group: {
    586            _id: "$geoLocationCountryCode",
    587            count: { $sum: 1 },
    588            //domain: { $addToSet: '$domain' },
    589            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    590            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
    591            }
    592            },
    593            { $sort : { count : -1} }
    594            ]);
    595          */
    596         output = collection.aggregate(Arrays.asList(
    597                            //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
    598          unwind("$geoLocationCountryCode"),
    599          group("$geoLocationCountryCode", Arrays.asList(
    600                                 sum("count", 1),
    601                                 /*addToSet("domain", "$domain"),*/
    602                                 sum("numPagesInMRICount", "$numPagesInMRI"),
    603                                 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
    604          sort(BasicDBObject.parse("{count : -1}"))       
    605         ));
    606         break;
    607 
    608     case 2:
    609         /*
    610           db.Websites.aggregate([
    611           { $match: { numPagesInMRI: {$gt: 0} } },
    612           { $unwind: "$geoLocationCountryCode" },
    613           {
    614           $group: {
    615           _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower
    616           count: { $sum: 1 },
    617           //domain: { $addToSet: '$domain' },
    618           numPagesInMRICount: { $sum: '$numPagesInMRI' },
    619           numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
    620           }
    621           },
    622           { $sort : { count : -1} }
    623           ]);
    624          */
    625          output = collection.aggregate(Arrays.asList(
    626                            match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
    627          unwind("$geoLocationCountryCode"),
    628          group("$geoLocationCountryCode", Arrays.asList(
    629                                 sum("count", 1),
    630                                 /*addToSet("domain", "$domain"),*/
    631                                 sum("numPagesInMRICount", "$numPagesInMRI"),
    632                                 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
    633          sort(BasicDBObject.parse("{count : -1}"))       
    634         ));
    635         break;
    636        
    637     default: logger.error("Unknown table number: " + tableNum);
    638        
    639     }
    640 
    641      return output;
    642    
    643     }
    644    
    645    
    646    
    647     /**
    648      * called by lambda forEach() call on Document objects to write them out to a file.
    649      * Have to deal with unreported exceptions here that can't be dealt with when doing
    650      * the actual forEach(). See
    651      * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
    652      */   
    653     public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
    654 
    655     // If there's a domain field in the json Doc, sort this domain listing alphabetically
    656     Object domainList = doc.remove("domain");
    657     if(domainList != null) {
    658         doc.put("domain", sortAlphabetically(domainList));
    659     }
    660    
    661     //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
    662     // Can't control json output to add newlines after each array element,
    663     // no matter which JsonMode is used.
    664    
    665     // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
    666     // Still can't control array element output,
    667     // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
    668     //JsonWriterSettings writeSettings = new JsonWriterSettings();
    669     //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
    670     //writer.write(doc.toJson(writeSettings) + NEWLINE);
    671 
    672     // Not the JsonWriter of mongodb java driver:
    673     // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
    674    
    675     // Have to use gson's pretty print to produce a json string that contains
    676     // newlines after every array element in the json:
    677    
    678     String jsonStr = prettyPrintJson(doc.toJson());
    679     //System.err.println(jsonStr);
    680     try {
    681         writer.write(jsonStr + NEWLINE);
    682     } catch (IOException ex) {
    683         //throw ex;
    684         throw new UncheckedIOException(ex);
    685     }       
    686     }
    687 
    688     private List sortAlphabetically(Object list) {
    689     BsonArray domainList = (BsonArray)list;
    690     //for(String domain : domainList) {
    691     for(int i = domainList.size() - 1; i >= 0; i--) {
    692         BsonString domain = domainList.get(i).asString();
    693         String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
    694         domainList.set(i, new BsonString(domainStr));       
    695     }
    696 
    697     return domainList;
    698     }
    699    
    700     public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
    701     String jsonStr = prettyPrintJson(doc.toJson());
    702     //System.err.println(jsonStr);
    703     try {
    704         writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
    705     } catch (IOException ex) {
    706         //throw ex;
    707         throw new UncheckedIOException(ex);
    708     }
    709     }
    710 
    711     // TODO
    712     //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
    713     public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
    714     String jsonStr = doc.toJson();
    715     JsonParser parser = new JsonParser();
    716     JsonElement json = parser.parse(jsonStr);
    717 
    718     JsonObject jsonObj = (JsonObject)json;
    719    
    720     String countryCode = jsonObj.get("_id").getAsString();
    721     int siteCount = jsonObj.get("count").getAsInt();
    722     int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
    723     int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt(); 
    724    
    725     //System.err.println(jsonStr);
    726     try {
    727         //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
    728         csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount);
    729     } catch (IOException ex) {
    730         //throw ex;
    731         throw new UncheckedIOException(ex);
    732     }
    733     }
    734    
    735     public String prettyPrintJson(String jsonStr) {
    736     Gson gson = new GsonBuilder().setPrettyPrinting().create();
    737     JsonParser jp = new JsonParser();
    738     JsonElement je = jp.parse(jsonStr);
    739     String prettyJsonString = gson.toJson(je);
    740     return prettyJsonString;
    741     }
    742304
    743305   
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33906 r33909  
    7777
    7878    private String domainOfSite;
    79     private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
     79    //private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
    8080    private int numPagesInMRI = 0;
    8181    private int numPagesContainingMRI = 0;
     
    203203        String url = firstPage.getPageURL();
    204204        this.domainOfSite = Utility.getDomainForURL(url, true);
    205         this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
     205        //this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
    206206    }
    207207    else {
    208208        this.domainOfSite = "UNKNOWN";
    209         this.baseSiteDomain = "UNKNOWN";
     209        //this.baseSiteDomain = "UNKNOWN";
    210210    }
    211211   
     
    343343
    344344    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
    345           this.domainOfSite, this.baseSiteDomain,
     345          this.domainOfSite, //this.baseSiteDomain,
    346346          totalPages, this.countOfWebPagesWithBodyText,
    347347          this.numPagesInMRI, this.numPagesContainingMRI,
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33906 r33909  
    2929    static private final long FIXED_SEED = 1000;
    3030   
    31     private final MongoDBAccess mongodbAccess;
     31    private final MongoDBQueryer mongodbQueryer;
    3232    private File outFolder;
    3333
     
    4949   
    5050   
    51     public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
     51    public WebPageURLsListing(MongoDBQueryer mongodbQueryer, File outFolder)
    5252    {
    53     this.mongodbAccess = mongodbAccess;
     53    this.mongodbQueryer = mongodbQueryer;
    5454    this.outFolder = outFolder;
    5555    }
     
    5757   
    5858    public void produceURLsForPagesInMRI(File domainsFile) {
    59     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
     59    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
    6060    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
    6161    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    6666   
    6767    public void produceURLsForPagesContainingMRI(File domainsFile) {
    68     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile);   
     68    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile);   
    6969    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
    7070    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    9797            domain = domain.substring(0, index);
    9898            }
    99             ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
     99            ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType);
    100100
    101101            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
    102             if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
     102            if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) {
    103103            System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
    104104            }
     
    133133    public void mriWebPageListingForDomainListing(File domainsFile) {
    134134
    135     int filterType = MongoDBAccess.IS_MRI;
     135    int filterType = MongoDBQueryer.IS_MRI;
    136136   
    137137    // for overseas websites,
     
    139139
    140140    // 0. get a list of all the web pages in the given domain listing where isMRI = true
    141     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
     141    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
    142142        // produceURLsForPagesInMRI(domainsFile);
    143143   
     
    148148    // 2. write all the URLs in urlsList to a file
    149149    //File outFolder = domainsFile.getParentFile();
    150     String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
     150    String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_";
    151151    File outFile = new File(outFolder, fileName+domainsFile.getName());
    152152
     
    219219
    220220    /* ---------------------------------------- */
    221 
    222221    /**
    223      * Create the file 5counts_tentativeNonAutotranslatedSites.json
     222     * Create the file 5counts_containsMRISites_allNZGrouped.json
     223     * that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages
     224     * that CONTAIN_MRI, followed by counts and domains listing for overseas sites
     225     * that CONTAIN_MRI.
     226     * @return full path of file generated
     227     */
     228    public String writeContainsMRISites_nzSitesAndTLDsGrouped() {
     229   
     230    File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json");
     231
     232    String filename = Utility.getFilePath(outFile);
     233   
     234    try (
     235         Writer writer = new BufferedWriter(new FileWriter(outFile));
     236         ) {
     237        // first write out NZ sites and .nz TLD count and domains
     238        mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
     239        // next write out all overseas sites (not NZ origin or .nz TLD)
     240        // that have no "mi" in the URL path as mi.* or */mi
     241        boolean isMiInURLPath = false;
     242        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI);
     243       
     244    } catch(Exception e) {
     245        logger.error("Unable to write to file " + filename);
     246        logger.error(e.getMessage(), e);
     247    }
     248   
     249    System.err.println("*** Wrote file: " + filename);
     250
     251    return filename;
     252    }
     253   
     254    /**
     255     * Create the file 5a_counts_tentativeNonAutotranslatedSites.json
    224256     * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
    225257     * followed by counts and domain listing for overseas sites that are either from Australia
     
    237269         ) {
    238270        // first write out NZ sites and .nz TLD count and domains
    239         mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
     271        mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
    240272        // next write out all overseas sites (not NZ origin or .nz TLD)
    241273        // that have no "mi" in the URL path as mi.* or */mi
    242274        boolean isMiInURLPath = false;
    243         mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
     275        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
    244276       
    245277    } catch(Exception e) {
     
    254286
    255287    /**
     288     * Create the file 5b_counts_overseasSitesWithMiInPath.json
    256289     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
    257290     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
     
    267300         ) {
    268301        boolean isMiInURLPath = true;
    269         mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
     302        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
    270303
    271304    } catch(Exception e) {
     
    300333
    301334    try (
    302          MongoDBAccess mongodb = new MongoDBAccess();
     335         MongoDBQueryer mongodb = new MongoDBQueryer();
    303336         ) {
    304337
     
    335368        //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
    336369        //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
    337        
    338         String filename = listing.writeTentativeNonAutotranslatedSites();
     370
     371        // get all sites where >0 pages have containsMRI=true
     372        // grouping NZ sites and .nz TLDs together and remainder under overseas
     373        // geolocations.
     374        String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped();
     375
     376        // separately:
     377        // - all NZ containsMRI + overseas tentative non-product sites with containMRI
     378        // - overseas tentative product sites with containMRI
     379        filename = listing.writeTentativeNonAutotranslatedSites();
    339380        filename = listing.writeOverseasSitesWithMiInURLPath();
    340381
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33906 r33909  
    99    public final String siteFolderName;
    1010    public final String domain;
    11     public final String basicDomain; // domain without protocol and www. prefix
     11    //public final String basicDomain; // domain without protocol and www. prefix
    1212   
    1313    public final int totalPages;
     
    2525   
    2626    public WebsiteInfo(/*int siteCount,*/ String siteFolderName,
    27                String domainOfSite, String baseSiteDomain,
     27               String domainOfSite, //String baseSiteDomain,
    2828               int totalPages, int countOfWebPagesWithBodyText,
    2929               int numPagesInMRI, int numPagesContainingMRI,
     
    3434    this.siteFolderName = siteFolderName;
    3535    this.domain = domainOfSite;
    36     this.basicDomain = baseSiteDomain;
     36    //this.basicDomain = baseSiteDomain;
    3737   
    3838    this.totalPages = totalPages;
Note: See TracChangeset for help on using the changeset viewer.