Ignore:
Timestamp:
2020-01-29T21:48:52+13:00 (4 years ago)
Author:
ak19
Message:

Some missteps, but have got complex collection.aggregate() working at last.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33873 r33876  
    1818 * TO RUN:
    1919 *    maori-lang-detection/src$
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt  255
     20 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255
    2121 *
    2222*/
     
    2525   
    2626    private final MongoDBAccess mongodbAccess;
    27     private final int numURLs;
    28 
    29     //private String[] urls;
    30     ArrayList<String> urlsList = new ArrayList<String>();
     27    private int numURLs;
     28    private File domainsFile;
     29   
     30    public WebPageURLsListing(MongoDBAccess mongodbAccess,
     31                    File domainsFile)
     32    {
     33    this.mongodbAccess = mongodbAccess;
     34    this.domainsFile = domainsFile;
     35    }
    3136   
    3237    public WebPageURLsListing(MongoDBAccess mongodbAccess,
     
    3439                    int numURLs)
    3540    {
    36     this.mongodbAccess = mongodbAccess;
     41    this(mongodbAccess, domainsFile);
    3742    this.numURLs = numURLs;
     43    }
    3844
     45    public String produceURLsForPagesInMRI() {
     46    return writeFile(MongoDBAccess.IS_MRI);
     47    }
     48   
     49    public String produceURLsForPagesContainingMRI() {
     50    return writeFile(MongoDBAccess.CONTAINS_MRI);
     51    }
     52
     53
     54    public String writeFile(int filterType) {
     55
     56    ArrayList<String> urlsList = new ArrayList<String>();
     57   
    3958    // 1. read each url from the domainsFile
    40       // 1a. do the query
    41       // 1b. add the arraylist result to urls
    42 
     59    // 1a. do the query
     60    // 1b. add the arraylist result to urls
     61   
    4362    try (
    4463         BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
     
    5170        domain = domain.trim();
    5271        if(!domain.equals("")) {
    53             ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingIsMRIURLs(domain);
     72            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
    5473            urlsList.addAll(moreURLs);
    5574        }
     
    6382    // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
    6483    File parentFolder = domainsFile.getParentFile();
    65     String fileName = domainsFile.getName();
    66     //File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);
    67     File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);
     84    //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName());
     85    String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
     86    File outFile = new File(parentFolder, fileName+domainsFile.getName());
    6887
    6988    // write out ALL the URLs
    7089    try (
    71          Writer writer = new BufferedWriter(new FileWriter(fullSetOutFile));
     90         Writer writer = new BufferedWriter(new FileWriter(outFile));
    7291         ) {
    7392
     
    7897        }
    7998    } catch(Exception e) {
    80         logger.error("Unable to write to file " + fullSetOutFile.getAbsolutePath());
     99        logger.error("Unable to write to file " + outFile.getAbsolutePath());
    81100        logger.error(e.getMessage(), e);
    82101    }
     
    98117    }
    99118    */
     119
     120    return outFile.getAbsolutePath();
    100121    }
     122   
    101123   
    102124    public static void printUsage() {
     
    107129    // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
    108130
    109    
    110131   
    111132    public static void main(String args[]) {
     
    124145        }
    125146
    126         int genNumURLs = Integer.parseInt(args[1]);
     147        //int genNumURLs = Integer.parseInt(args[1]);
    127148
    128149        mongodb.connectToDB();
    129150       
    130151        WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile);
     152        //String isMRIFile = listing.produceURLsForPagesInMRI();
     153        //String containsMRIFile = listing.produceURLsForPagesContainingMRI();
     154        mongodb.aggregateContainsMRIForOverseas();
    131155       
    132156    } catch(Exception e) {
Note: See TracChangeset for help on using the changeset viewer.