Changeset 33906

Show
Ignore:
Timestamp:
05.02.2020 23:36:37 (13 days ago)
Author:
ak19
Message:

Code is in an intermediate state. 1. Introduced a basicDomain field to MongoDB and recreated the MongoDB tables/collections; this will help discount duplicated domains under http and https, with and without www. Though webpage URLs may potentially still be unique and not duplicated across all 4 possible variants, I want them counted under the same base domain name. 2. Another issue noticed now is that some of the sites appear to be hosted on servers in multiple countries, and so slightly different country code counts and domain listings are returned. 3. So added code modifications (untested) to sort the domains alphabetically after stripping protocol and www, to allow comparing the old domainListing results of MongoDB's now-renamed oldWebsites and oldWebpages collections against the new versions of these collections, and to then update the differences in manual counts.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33887 r33906  
    2626import com.mongodb.Block; 
    2727 
     28import org.bson.BsonArray; 
     29import org.bson.BsonString; 
    2830import org.bson.Document; 
    2931import org.bson.conversions.Bson; 
     
    217219        .append("siteFolderName", website.siteFolderName)  
    218220        .append("domain", website.domain)  
     221        .append("basicDomain", website.basicDomain)  
    219222        .append("totalPages", website.totalPages) 
    220223        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) 
     
    389392     *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 
    390393     * 
    391      * 
     394     * Count by country code of non-NZ websites containing a positive number of sentences in MRI, 
     395     * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 
     396     * and total counts of numPagesInMRI and numPagesContainingMRI across all these 
     397     * matching sites. 
     398     *  
    392399     * The mongodb aggregate() we want to run this time: 
    393400     * 
     
    405412          $group: { 
    406413            _id: "nz", 
    407             count: { $sum: 1 }, 
    408             domain: { $addToSet: '$domain' } 
     414            count: { $sum: 1 },             
     415        domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 
    409416          } 
    410417    }, 
     
    431438         match(andQuery), 
    432439         unwind("$geoLocationCountryCode"), 
    433          group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
     440         group("NZ", Arrays.asList(sum("count", 1), 
     441                   addToSet("domain", "$basicDomain"))), 
    434442         sort(BasicDBObject.parse("{count : -1}")) 
    435443     )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
     
    441449     
    442450    /** 
     451     * Count of NZ (incl .nz TLD)  websites containing a positive number of sentences in MRI,  
     452     * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 
     453     * and total counts of numPagesInMRI and numPagesContainingMRI across all these 
     454     * matching sites. 
     455     * 
    443456     * The aggregate() we want to run this time: 
    444457     * 
     
    459472            _id: {$toLower: '$geoLocationCountryCode'}, 
    460473            count: { $sum: 1 }, 
    461             domain: { $addToSet: '$domain' } 
     474        domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 
    462475          } 
    463476     }, 
     
    484497        orQuery); 
    485498 
    486  
    487499    collection.aggregate(Arrays.asList( 
    488500         match(andQuery),  //match(BasicDBObject.parse(matchQuery)) 
    489501         // match((List<DBObject>)JSON.parse(matchQuery)), 
    490502         unwind("$geoLocationCountryCode"), 
    491          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
     503         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),  
     504                        addToSet("domain", "$basicDomain"))), 
    492505         sort(BasicDBObject.parse("{count : -1}")) 
    493506       )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
     
    639652     */     
    640653    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 
     654 
     655    // If there's a domain field in the json Doc, sort this domain listing alphabetically 
     656    Object domainList = doc.remove("domain"); 
     657    if(domainList != null) { 
     658        doc.put("domain", sortAlphabetically(domainList)); 
     659    } 
     660     
    641661    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); 
    642662    // Can't control json output to add newlines after each array element, 
     
    655675    // Have to use gson's pretty print to produce a json string that contains 
    656676    // newlines after every array element in the json: 
     677     
    657678    String jsonStr = prettyPrintJson(doc.toJson()); 
    658679    //System.err.println(jsonStr); 
     
    663684        throw new UncheckedIOException(ex); 
    664685    }        
     686    } 
     687 
     688    private List sortAlphabetically(Object list) { 
     689    BsonArray domainList = (BsonArray)list; 
     690    //for(String domain : domainList) { 
     691    for(int i = domainList.size() - 1; i >= 0; i--) { 
     692        BsonString domain = domainList.get(i).asString(); 
     693        String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString()); 
     694        domainList.set(i, new BsonString(domainStr));        
     695    } 
     696 
     697    return domainList; 
    665698    } 
    666699     
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33811 r33906  
    7575    private String geoLocationCountryCode = null; /** 2 letter country code */ 
    7676    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */ 
    77      
     77 
    7878    private String domainOfSite; 
     79    private String baseSiteDomain; // domainOfSite stripped of any http(s)://www. 
    7980    private int numPagesInMRI = 0; 
    8081    private int numPagesContainingMRI = 0; 
     
    202203        String url = firstPage.getPageURL(); 
    203204        this.domainOfSite = Utility.getDomainForURL(url, true); 
     205        this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite); 
    204206    } 
    205207    else { 
    206208        this.domainOfSite = "UNKNOWN"; 
     209        this.baseSiteDomain = "UNKNOWN"; 
    207210    } 
    208211     
     
    339342    int totalPages = pages.size();   
    340343 
    341     WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite, 
     344    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, 
     345          this.domainOfSite, this.baseSiteDomain, 
    342346          totalPages, this.countOfWebPagesWithBodyText, 
    343347          this.numPagesInMRI, this.numPagesContainingMRI, 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33887 r33906  
    5757     
    5858    public void produceURLsForPagesInMRI(File domainsFile) { 
    59     ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
     59    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 
    6060    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 
    6161    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     
    6666     
    6767    public void produceURLsForPagesContainingMRI(File domainsFile) { 
    68     ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);     
     68    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile);     
    6969    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 
    7070    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     
    7474    } 
    7575     
    76     private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) { 
     76    private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) { 
    7777    ArrayList<Tuple> urlsList = new ArrayList<Tuple>(); 
    7878     
     
    120120    } 
    121121     
    122     /** Given a hand curated list of NZ sites with positive numPagesContainingMRI, 
    123      * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).  
    124      * Total all these pages in MRI (N), then work out the correct sample size (n) 
     122    /** Given a hand curated list of all sites with positive numPagesContainingMRI 
     123     * determined by manual inspection, get a listing of all their web pages that 
     124     * are IN_MRI (or CONTAINS_MRI?).  
     125     * Total all these pages that are inMRI (N), then work out the correct sample size (n) 
    125126     * at 90% confidence with 5% margin of error. Then generate a random listing 
    126127     * of n of these pages in MRI of these trusted sites and output to a file 
    127      * for manual inspection. */ 
     128     * for manual inspection of the sample webpage URLs at page-level. */ 
    128129    /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing 
    129130     * of all their web pages IN_MRI (or CONTAINS_MRI). 
     
    138139 
    139140    // 0. get a list of all the web pages in the given domain listing where isMRI = true 
    140     ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
     141    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 
    141142        // produceURLsForPagesInMRI(domainsFile); 
    142143     
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33811 r33906  
    99    public final String siteFolderName; 
    1010    public final String domain; 
     11    public final String basicDomain; // domain without protocol and www. prefix 
    1112     
    1213    public final int totalPages; 
     
    2324    public final boolean urlContainsLangCodeInPath; 
    2425     
    25     public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 
     26    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, 
     27               String domainOfSite, String baseSiteDomain, 
    2628               int totalPages, int countOfWebPagesWithBodyText, 
    2729               int numPagesInMRI, int numPagesContainingMRI, 
     
    3234    this.siteFolderName = siteFolderName; 
    3335    this.domain = domainOfSite; 
     36    this.basicDomain = baseSiteDomain; 
    3437     
    3538    this.totalPages = totalPages;