Changeset 33882 for other-projects


Ignore:
Timestamp:
2020-01-30T22:54:39+13:00 (4 years ago)
Author:
ak19
Message:

Code now writes both a listing of all non-autotranslated websites and a listing of overseas autotranslated sites.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33881 r33882  
    9696    public static final int CONTAINS_MRI = 1;
    9797
     98    /** Some reused fieldnames in the Websites collection */
     99    private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
     100    private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
     101
    98102    // configuration details, some with fallback values
    99103    private String HOST = "localhost";
     
    402406    ]);
    403407    */
    404     public void aggregateContainsMRIForNZ(Writer writer) throws IOException {
     408    public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
    405409    // working with the WebSites collection, not WebPages collection!
    406410    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    407411
    408    
    409     //String isMRI_filter =
     412    String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
    410413   
    411414    Bson orQuery = or(
     
    414417              );
    415418    Bson andQuery = and(
    416         BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
     419        BasicDBObject.parse(mriFilterString),
    417420        orQuery);
    418421   
     
    456459    ]);
    457460    */
    458     public void aggregateContainsMRIForOverseas(Writer writer) throws UncheckedIOException {
     461    public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
     462                        boolean isMiInURLPath) throws UncheckedIOException
     463    {
    459464    // working with the WebSites collection, not WebPages collection!
    460465    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    461 
     466   
     467    String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
    462468   
    463469    Bson orQuery = or(
    464470              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
    465               BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
     471              BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
     472              // e.g. "{urlContainsLangCodeInPath: false}"
    466473              );
    467474    Bson andQuery = and(
    468475        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
    469476        BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
    470         BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
     477        BasicDBObject.parse(mriFilterString),
    471478        orQuery);
    472479
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33880 r33882  
    2525   
    2626    private final MongoDBAccess mongodbAccess;
    27     private int numURLs;
    28     private File domainsFile;
    29 
    30 
    31    
    32     public WebPageURLsListing(MongoDBAccess mongodbAccess,
    33                     File domainsFile)
     27    private File outFolder;
     28
     29    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
    3430    {
    3531    this.mongodbAccess = mongodbAccess;
    36     this.domainsFile = domainsFile;
    37     }
    38    
    39     public WebPageURLsListing(MongoDBAccess mongodbAccess,
    40                     File domainsFile,
    41                     int numURLs)
    42     {
    43     this(mongodbAccess, domainsFile);
    44     this.numURLs = numURLs;
    45     }
    46 
    47     public String produceURLsForPagesInMRI() {
    48     return writeFile(MongoDBAccess.IS_MRI);
    49     }
    50    
    51     public String produceURLsForPagesContainingMRI() {
    52     return writeFile(MongoDBAccess.CONTAINS_MRI);
    53     }
    54 
    55 
    56     public String writeFile(int filterType) {
     32    this.outFolder = outFolder;
     33    }
     34   
     35    public String produceURLsForPagesInMRI(File domainsFile) {
     36    return writeFile(MongoDBAccess.IS_MRI, domainsFile);
     37    }
     38   
     39    public String produceURLsForPagesContainingMRI(File domainsFile) {
     40    return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile);
     41    }
     42
     43
     44    public String writeFile(int filterType, File domainsFile/*, int numURLs*/) {
    5745
    5846    ArrayList<String> urlsList = new ArrayList<String>();
     
    126114
    127115    /**
    128      * Create the file
     116     * Create the file 5a_counts_tentativeNonAutotranslatedSites.json
     117     * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
     118     * followed by counts and domain listing for overseas sites that are either from Australia
     119     * or don't contain mi in their URL path.
    129120     * @return full path of file generated
    130121     */
    131122    public String writeTentativeNonAutotranslatedSites() {
    132     File outFolder = new File("../mongodb-data/").getAbsoluteFile();
    133     File outFile = new File(outFolder, "5counts_tentativeNonAutotranslatedSites.json");
     123   
     124    File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
    134125
    135126    String filename = outFile.getAbsolutePath();
     
    139130         ) {
    140131        // first write out NZ sites and .nz TLD count and domains
    141         mongodbAccess.aggregateContainsMRIForNZ(writer);
    142         // next write out all overseas sites and .nz TLD count and domains
    143         mongodbAccess.aggregateContainsMRIForOverseas(writer);
     132        mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
     133        // next write out all overseas sites (not NZ origin or .nz TLD)
     134        // that have no "mi" in the URL path as mi.* or */mi
     135        boolean isMiInURLPath = false;
     136        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
    144137
    145138        filename = outFile.getCanonicalPath();
     
    148141        logger.error(e.getMessage(), e);
    149142    }
     143   
     144    System.err.println("*** Wrote file: " + filename);
    150145
    151146    return filename;
    152147    }
    153    
     148
     149    /**
     150     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
     151     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path
     152     */
     153    public String writeOverseasSitesWithMiInURLPath() {
     154    File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
     155
     156    String filename = outFile.getAbsolutePath();
     157    try (
     158         Writer writer = new BufferedWriter(new FileWriter(outFile));
     159         ) {
     160        boolean isMiInURLPath = true;
     161        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
     162        filename = outFile.getCanonicalPath();
     163    } catch(Exception e) {
     164        logger.error("Unable to write to file " + outFile.getAbsolutePath());
     165        logger.error(e.getMessage(), e);
     166    }
     167
     168    System.err.println("*** Wrote file: " + filename);
     169    return filename;
     170    }
    154171   
    155172    public static void printUsage() {
     
    170187         MongoDBAccess mongodb = new MongoDBAccess();
    171188         ) {
    172         File domainsFile = new File(args[0]);
    173         if(!domainsFile.exists()) {
    174         System.err.println("File " + domainsFile + " does not exist");
    175         System.exit(-1);
    176         }
    177 
    178         //int genNumURLs = Integer.parseInt(args[1]);
    179189
    180190        mongodb.connectToDB();
    181        
    182         WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile);
    183         //String isMRIFile = listing.produceURLsForPagesInMRI();
    184         //String containsMRIFile = listing.produceURLsForPagesContainingMRI();
    185         String filename = listing.writeTentativeNonAutotranslatedSites();
    186         System.err.println("Check file: " + filename);
     191
     192        // output files will be stored in mongodb-data-auto
     193        File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
     194        WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);
     195
     196       
     197        if(args.length >= 1) {
     198        File domainsFile = new File(args[0]);
     199        if(!domainsFile.exists()) {
     200            System.err.println("File " + domainsFile + " does not exist");
     201            System.exit(-1);
     202        }
     203
     204       
     205        //int genNumURLs = Integer.parseInt(args[1]);
     206        //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
     207        //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
     208
     209        //listing.writeWebPagesOfAllNZSitesAndDomainListing();
     210       
     211        } else {       
     212
     213        String filename = listing.writeTentativeNonAutotranslatedSites();
     214        filename = listing.writeOverseasSitesWithMiInURLPath();
     215        }
    187216
    188217       
Note: See TracChangeset for help on using the changeset viewer.