Changeset 33906


Ignore:
Timestamp:
2020-02-05T23:36:37+13:00 (4 years ago)
Author:
ak19
Message:

Code is in an intermediate state. 1. Introduced a basicDomain field to MongoDB and recreated the MongoDB tables/collections; this will help discount duplicated domains under http and https, with and without www. Though webpage URLs may potentially still be unique and not duplicated across all 4 possible variants, I want them counted under the same base domain name. 2. Another issue noticed now is that some of the sites appear to be hosted on multiple countries' servers, and so slightly different country code counts and domain listings are returned. 3. So added code modifications (untested) to sort the domains alphabetically after stripping protocol and www, to allow comparing the old domainListing results of MongoDB's now-renamed oldWebsites and oldWebpages collections with the new versions of these collections, and then to update the differences in manual counts.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33887 r33906  
    2626import com.mongodb.Block;
    2727
     28import org.bson.BsonArray;
     29import org.bson.BsonString;
    2830import org.bson.Document;
    2931import org.bson.conversions.Bson;
     
    217219        .append("siteFolderName", website.siteFolderName)
    218220        .append("domain", website.domain)
     221        .append("basicDomain", website.basicDomain)
    219222        .append("totalPages", website.totalPages)
    220223        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
     
    389392     *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
    390393     *
    391      *
     394     * Count by country code of non-NZ websites containing a positive number of sentences in MRI,
     395     * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
     396     * and total counts of numPagesInMRI and numPagesContainingMRI across all these
     397     * matching sites.
     398     *
    392399     * The mongodb aggregate() we want to run this time:
    393400     *
     
    405412          $group: {
    406413            _id: "nz",
    407             count: { $sum: 1 },
    408             domain: { $addToSet: '$domain' }
     414            count: { $sum: 1 },           
     415        domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
    409416          }
    410417    },
     
    431438         match(andQuery),
    432439         unwind("$geoLocationCountryCode"),
    433          group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     440         group("NZ", Arrays.asList(sum("count", 1),
     441                   addToSet("domain", "$basicDomain"))),
    434442         sort(BasicDBObject.parse("{count : -1}"))
    435443     )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
     
    441449   
    442450    /**
     451     * Count of NZ (incl .nz TLD)  websites containing a positive number of sentences in MRI,
     452     * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
     453     * and total counts of numPagesInMRI and numPagesContainingMRI across all these
     454     * matching sites.
     455     *
    443456     * The aggregate() we want to run this time:
    444457     *
     
    459472            _id: {$toLower: '$geoLocationCountryCode'},
    460473            count: { $sum: 1 },
    461             domain: { $addToSet: '$domain' }
     474        domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
    462475          }
    463476     },
     
    484497        orQuery);
    485498
    486 
    487499    collection.aggregate(Arrays.asList(
    488500         match(andQuery),  //match(BasicDBObject.parse(matchQuery))
    489501         // match((List<DBObject>)JSON.parse(matchQuery)),
    490502         unwind("$geoLocationCountryCode"),
    491          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     503         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
     504                        addToSet("domain", "$basicDomain"))),
    492505         sort(BasicDBObject.parse("{count : -1}"))
    493506       )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
     
    639652     */   
    640653    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
     654
     655    // If there's a domain field in the json Doc, sort this domain listing alphabetically
     656    Object domainList = doc.remove("domain");
     657    if(domainList != null) {
     658        doc.put("domain", sortAlphabetically(domainList));
     659    }
     660   
    641661    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
    642662    // Can't control json output to add newlines after each array element,
     
    655675    // Have to use gson's pretty print to produce a json string that contains
    656676    // newlines after every array element in the json:
     677   
    657678    String jsonStr = prettyPrintJson(doc.toJson());
    658679    //System.err.println(jsonStr);
     
    663684        throw new UncheckedIOException(ex);
    664685    }       
     686    }
     687
     688    private List sortAlphabetically(Object list) {
     689    BsonArray domainList = (BsonArray)list;
     690    //for(String domain : domainList) {
     691    for(int i = domainList.size() - 1; i >= 0; i--) {
     692        BsonString domain = domainList.get(i).asString();
     693        String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
     694        domainList.set(i, new BsonString(domainStr));       
     695    }
     696
     697    return domainList;
    665698    }
    666699   
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33811 r33906  
    7575    private String geoLocationCountryCode = null; /** 2 letter country code */
    7676    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */
    77    
     77
    7878    private String domainOfSite;
     79    private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
    7980    private int numPagesInMRI = 0;
    8081    private int numPagesContainingMRI = 0;
     
    202203        String url = firstPage.getPageURL();
    203204        this.domainOfSite = Utility.getDomainForURL(url, true);
     205        this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
    204206    }
    205207    else {
    206208        this.domainOfSite = "UNKNOWN";
     209        this.baseSiteDomain = "UNKNOWN";
    207210    }
    208211   
     
    339342    int totalPages = pages.size(); 
    340343
    341     WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
     344    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
     345          this.domainOfSite, this.baseSiteDomain,
    342346          totalPages, this.countOfWebPagesWithBodyText,
    343347          this.numPagesInMRI, this.numPagesContainingMRI,
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33887 r33906  
    5757   
    5858    public void produceURLsForPagesInMRI(File domainsFile) {
    59     ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
     59    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
    6060    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
    6161    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    6666   
    6767    public void produceURLsForPagesContainingMRI(File domainsFile) {
    68     ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);   
     68    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile);   
    6969    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
    7070    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    7474    }
    7575   
    76     private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) {
     76    private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) {
    7777    ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
    7878   
     
    120120    }
    121121   
    122     /** Given a hand curated list of NZ sites with positive numPagesContainingMRI,
    123      * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).
    124      * Total all these pages in MRI (N), then work out the correct sample size (n)
     122    /** Given a hand curated list of all sites with positive numPagesContainingMRI
     123     * determined by manual inspection, get a listing of all their web pages that
     124     * are IN_MRI (or CONTAINS_MRI?).
     125     * Total all these pages that are inMRI (N), then work out the correct sample size (n)
    125126     * at 90% confidence with 5% margin of error. Then generate a random listing
    126127     * of n of these pages in MRI of these trusted sites and output to a file
    127      * for manual inspection. */
     128     * for manual inspection of the sample webpage URLs at page-level. */
    128129    /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
    129130     * of all their web pages IN_MRI (or CONTAINS_MRI).
     
    138139
    139140    // 0. get a list of all the web pages in the given domain listing where isMRI = true
    140     ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
     141    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
    141142        // produceURLsForPagesInMRI(domainsFile);
    142143   
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33811 r33906  
    99    public final String siteFolderName;
    1010    public final String domain;
     11    public final String basicDomain; // domain without protocol and www. prefix
    1112   
    1213    public final int totalPages;
     
    2324    public final boolean urlContainsLangCodeInPath;
    2425   
    25     public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite,
     26    public WebsiteInfo(/*int siteCount,*/ String siteFolderName,
     27               String domainOfSite, String baseSiteDomain,
    2628               int totalPages, int countOfWebPagesWithBodyText,
    2729               int numPagesInMRI, int numPagesContainingMRI,
     
    3234    this.siteFolderName = siteFolderName;
    3335    this.domain = domainOfSite;
     36    this.basicDomain = baseSiteDomain;
    3437   
    3538    this.totalPages = totalPages;
Note: See TracChangeset for help on using the changeset viewer.