Ignore:
Timestamp:
2020-01-31T22:21:40+13:00 (4 years ago)
Author:
ak19
Message:
  1. The previous commit contained many modifications, of which only 2 files matched its simple commit message about clarifications. The code changes in the previous commit incorporated the processing of a domains file (of curated sites) and wrote out all webPages in each of those sites where isMRI=true. They then calculated a representative sample size n out of the N total isMRI webPages, shuffled that list of isMRI webPages, and wrote out the first n webPage URLs in that list. 2. This commit: incorporating the country code alongside the URLs, as Dr Bainbridge requested.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33883 r33884  
    3030    private File outFolder;
    3131
     32
     33   
     34    public static class Tuple {
     35    public final String url;
     36    public final String countryCode;
     37   
     38    public Tuple(String url, String countryCode) {
     39        this.url = url;
     40        this.countryCode = countryCode;
     41    }
     42
     43    public String toString() {
     44        return this.url + "," + countryCode;
     45    }
     46    }
     47   
    3248   
    3349    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
     
    4662   
    4763    public void produceURLsForPagesInMRI(File domainsFile) {
    48     ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
     64    ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
    4965    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
    5066    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    5571   
    5672    public void produceURLsForPagesContainingMRI(File domainsFile) {
    57     ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);   
     73    ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);   
    5874    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
    5975    writeURLsToFile(urlsList, outFile, urlsList.size());
     
    6379    }
    6480   
    65     private ArrayList<String> getURLsForWebPages(int filterType, File domainsFile) {
    66     ArrayList<String> urlsList = new ArrayList<String>();
     81    private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) {
     82    ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
    6783   
    6884    // 1. read each url from the domainsFile
     
    7995        domain = domain.trim();
    8096        if(!domain.equals("")) {
     97           
     98            String countryCode = "";
     99            int index = domain.lastIndexOf(",");
     100            if(index != -1) {
     101            countryCode = domain.substring(index+1).trim();
     102            domain = domain.substring(0, index);
     103            }
    81104            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
    82             urlsList.addAll(moreURLs);
     105
     106            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
    83107            if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
    84             System.out.println("Domain " + domain + " had no isMRI webpages (only containsMRI).");
     108            System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
    85109            }
     110
     111            //urlsList.addAll(moreURLs);
     112            for(int i = 0; i < moreURLs.size(); i++) {
     113            urlsList.add(new Tuple(moreURLs.get(i), countryCode));
     114            }
     115           
    86116        }
    87117        }
     118        System.err.println("");
    88119    } catch(Exception e) {
    89120        logger.error("Unable to read URLs from file " + getFilePath(domainsFile));
     
    112143
    113144    // 0. get a list of all the web pages in the given domain listing where isMRI = true
    114     ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
     145    ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
    115146        // produceURLsForPagesInMRI(domainsFile);
    116147   
     
    174205     * Writes out the first n URLs in urlsList into outFile.
    175206     */
    176     private void writeURLsToFile(ArrayList<String> urlsList, File outFile, final int n) {
     207    private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
    177208    try (
    178209         Writer writer = new BufferedWriter(new FileWriter(outFile));
     
    180211
    181212        for (int i=0; i < n; i++) {
    182         String url = urlsList.get(i);
     213        Tuple urlInfo = urlsList.get(i);
     214       
    183215        //System.out.println(list.get(i));
    184         writer.write(url + "\n");
     216        writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
    185217        }
    186218    } catch(Exception e) {
     
    281313        WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);
    282314
     315        System.out.println("*************************************");
     316       
    283317       
    284318        if(args.length >= 1) {
     
    311345        // TODO: generate the tables
    312346        }
    313        
     347
     348        System.out.println("*************************************");
    314349    } catch(Exception e) {
    315350        logger.error(e.getMessage(), e);
Note: See TracChangeset for help on using the changeset viewer.