Changeset 33884

Show
Ignore:
Timestamp:
31.01.2020 22:21:40 (3 weeks ago)
Author:
ak19
Message:

0. Previous commit had lots of modifications, and only 2 files matched the simple commit message of clarifications. The code changes in the previous commit were to incorporate the processing of a domains file (of curated sites) and write out all webPages in each of those sites where isMRI=true, and then to calculate a representative sample size n out of N total isMRI webPages, shuffle that list of isMRI webPages and write out the first n webPage URLs in that list. 1. This commit: incorporating country code alongside URLs as Dr Bainbridge requested.

Location:
other-projects/maori-lang-detection
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/mongodb-data/6table_nonProductSites1_manualShortlist.json

    r33883 r33884  
    9696 
    9797 
    98  
    99 ï»¿"_id","siteCount containsMRI","numPagesInMRICount","numPagesContainingMRICount","URLs of pages detected as inMRI" 
     98// To add column: "URLs of pages detected as inMRI" 
     99"_id","siteCount containsMRI","numPagesInMRICount","numPagesContainingMRICount" 
    100100"nz","176.0","4360","9641" 
    101101"us","29.0","681","953" 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33883 r33884  
    3030    private File outFolder; 
    3131 
     32 
     33     
     34    public static class Tuple { 
     35    public final String url; 
     36    public final String countryCode; 
     37     
     38    public Tuple(String url, String countryCode) { 
     39        this.url = url; 
     40        this.countryCode = countryCode; 
     41    } 
     42 
     43    public String toString() { 
     44        return this.url + "," + countryCode; 
     45    } 
     46    } 
     47     
    3248     
    3349    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) 
     
    4662     
    4763    public void produceURLsForPagesInMRI(File domainsFile) { 
    48     ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
     64    ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
    4965    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 
    5066    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     
    5571     
    5672    public void produceURLsForPagesContainingMRI(File domainsFile) { 
    57     ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);    
     73    ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);     
    5874    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 
    5975    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     
    6379    } 
    6480     
    65     private ArrayList<String> getURLsForWebPages(int filterType, File domainsFile) { 
    66     ArrayList<String> urlsList = new ArrayList<String>(); 
     81    private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) { 
     82    ArrayList<Tuple> urlsList = new ArrayList<Tuple>(); 
    6783     
    6884    // 1. read each url from the domainsFile 
     
    7995        domain = domain.trim(); 
    8096        if(!domain.equals("")) { 
     97             
     98            String countryCode = ""; 
     99            int index = domain.lastIndexOf(","); 
     100            if(index != -1) { 
     101            countryCode = domain.substring(index+1).trim(); 
     102            domain = domain.substring(0, index); 
     103            } 
    81104            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 
    82             urlsList.addAll(moreURLs); 
     105 
     106            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know 
    83107            if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) { 
    84             System.out.println("Domain " + domain + " had no isMRI webpages (only containsMRI)."); 
     108            System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI."); 
    85109            } 
     110 
     111            //urlsList.addAll(moreURLs); 
     112            for(int i = 0; i < moreURLs.size(); i++) { 
     113            urlsList.add(new Tuple(moreURLs.get(i), countryCode)); 
     114            } 
     115             
    86116        } 
    87117        } 
     118        System.err.println(""); 
    88119    } catch(Exception e) { 
    89120        logger.error("Unable to read URLs from file " + getFilePath(domainsFile)); 
     
    112143 
    113144    // 0. get a list of all the web pages in the given domain listing where isMRI = true 
    114     ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
     145    ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
    115146        // produceURLsForPagesInMRI(domainsFile); 
    116147     
     
    174205     * Writes out the first n URLs in urlsList into outFile. 
    175206     */ 
    176     private void writeURLsToFile(ArrayList<String> urlsList, File outFile, final int n) { 
     207    private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) { 
    177208    try ( 
    178209         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     
    180211 
    181212        for (int i=0; i < n; i++) { 
    182         String url = urlsList.get(i); 
     213        Tuple urlInfo = urlsList.get(i); 
     214         
    183215        //System.out.println(list.get(i)); 
    184         writer.write(url + "\n"); 
     216        writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode 
    185217        } 
    186218    } catch(Exception e) { 
     
    281313        WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder); 
    282314 
     315        System.out.println("*************************************"); 
     316         
    283317         
    284318        if(args.length >= 1) { 
     
    311345        // TODO: generate the tables 
    312346        } 
    313          
     347 
     348        System.out.println("*************************************"); 
    314349    } catch(Exception e) { 
    315350        logger.error(e.getMessage(), e);