Ignore:
Timestamp:
2020-02-12T19:02:44+13:00 (4 years ago)
Author:
ak19
Message:
  1. Implemented tables 3 to 5. 2. Rolled back the introduction of the basicDomain field (the domain stripped of its http/https and www prefixes), since the code can create this field and sort it alphabetically itself, whereas it did not sort properly in MongoDB. 3. The code now sorts the domains, stripped of protocol and www, for the MongoDB queries that produce domain results, and ensures the domain list is unique. 4. Split the MongoDBAccess class into two, with the connection code in MongoDBAccess.java and the querying code in MongoDBQueryer (a subclass of MongoDBAccess), which is so far used exclusively by WebPageURLsListing.java.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33906 r33909  
    2929    static private final long FIXED_SEED = 1000;
    3030   
    31     private final MongoDBAccess mongodbAccess;
     31    private final MongoDBQueryer mongodbQueryer;
    3232    private File outFolder;
    3333
     
    4949   
    5050   
    51     public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
     51    public WebPageURLsListing(MongoDBQueryer mongodbQueryer, File outFolder)
    5252    {
    53     this.mongodbAccess = mongodbAccess;
     53    this.mongodbQueryer = mongodbQueryer;
    5454    this.outFolder = outFolder;
    5555    }
     
    5757   
    5858    public void produceURLsForPagesInMRI(File domainsFile) {
    59     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
     59    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
    6060    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
    6161    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    6666   
    6767    public void produceURLsForPagesContainingMRI(File domainsFile) {
    68     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile);   
     68    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile);   
    6969    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
    7070    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    9797            domain = domain.substring(0, index);
    9898            }
    99             ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
     99            ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType);
    100100
    101101            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
    102             if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
     102            if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) {
    103103            System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
    104104            }
     
    133133    public void mriWebPageListingForDomainListing(File domainsFile) {
    134134
    135     int filterType = MongoDBAccess.IS_MRI;
     135    int filterType = MongoDBQueryer.IS_MRI;
    136136   
    137137    // for overseas websites,
     
    139139
    140140    // 0. get a list of all the web pages in the given domain listing where isMRI = true
    141     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
     141    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
    142142        // produceURLsForPagesInMRI(domainsFile);
    143143   
     
    148148    // 2. write all the URLs in urlsList to a file
    149149    //File outFolder = domainsFile.getParentFile();
    150     String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
     150    String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_";
    151151    File outFile = new File(outFolder, fileName+domainsFile.getName());
    152152
     
    219219
    220220    /* ---------------------------------------- */
    221 
    222221    /**
    223      * Create the file 5counts_tentativeNonAutotranslatedSites.json
     222     * Create the file 5counts_containsMRISites_allNZGrouped.json
     223     * that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages
     224     * that CONTAIN_MRI, followed by counts and domains listing for overseas sites
     225     * that CONTAIN_MRI.
     226     * @return full path of file generated
     227     */
     228    public String writeContainsMRISites_nzSitesAndTLDsGrouped() {
     229   
     230    File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json");
     231
     232    String filename = Utility.getFilePath(outFile);
     233   
     234    try (
     235         Writer writer = new BufferedWriter(new FileWriter(outFile));
     236         ) {
     237        // first write out NZ sites and .nz TLD count and domains
     238        mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
     239        // next write out all overseas sites (not NZ origin or .nz TLD)
     240        // that CONTAIN_MRI (no isMiInURLPath filter is passed to this call)
     241        boolean isMiInURLPath = false;
     242        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI);
     243       
     244    } catch(Exception e) {
     245        logger.error("Unable to write to file " + filename);
     246        logger.error(e.getMessage(), e);
     247    }
     248   
     249    System.err.println("*** Wrote file: " + filename);
     250
     251    return filename;
     252    }
     253   
     254    /**
     255     * Create the file 5a_counts_tentativeNonAutotranslatedSites.json
    224256     * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
    225257     * followed by counts and domain listing for overseas sites that are either from Australia
     
    237269         ) {
    238270        // first write out NZ sites and .nz TLD count and domains
    239         mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
     271        mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
    240272        // next write out all overseas sites (not NZ origin or .nz TLD)
    241273        // that have no "mi" in the URL path as mi.* or */mi
    242274        boolean isMiInURLPath = false;
    243         mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
     275        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
    244276       
    245277    } catch(Exception e) {
     
    254286
    255287    /**
     288     * Create the file 5b_counts_overseasSitesWithMiInPath.json
    256289     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
    257290     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
     
    267300         ) {
    268301        boolean isMiInURLPath = true;
    269         mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
     302        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
    270303
    271304    } catch(Exception e) {
     
    300333
    301334    try (
    302          MongoDBAccess mongodb = new MongoDBAccess();
     335         MongoDBQueryer mongodb = new MongoDBQueryer();
    303336         ) {
    304337
     
    335368        //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
    336369        //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
    337        
    338         String filename = listing.writeTentativeNonAutotranslatedSites();
     370
     371        // get all sites where >0 pages have containsMRI=true
     372        // grouping NZ sites and .nz TLDs together and remainder under overseas
     373        // geolocations.
     374        String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped();
     375
     376        // separately:
     377        // - all NZ containsMRI + overseas tentative non-product sites with containsMRI
     378        // - overseas tentative product sites with containsMRI
     379        filename = listing.writeTentativeNonAutotranslatedSites();
    339380        filename = listing.writeOverseasSitesWithMiInURLPath();
    340381
Note: See TracChangeset for help on using the changeset viewer.