Changeset 33882

Show
Ignore:
Timestamp:
30.01.2020 22:54:39 (3 weeks ago)
Author:
ak19
Message:

Code now writes both a listing of all non-autotranslated websites and a listing of overseas autotranslated sites.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33881 r33882  
    9696    public static final int CONTAINS_MRI = 1; 
    9797 
     98    /** Some reused fieldnames in the Websites collection */ 
     99    private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI"; 
     100    private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI"; 
     101 
    98102    // configuration details, some with fallback values 
    99103    private String HOST = "localhost"; 
     
    402406    ]); 
    403407    */ 
    404     public void aggregateContainsMRIForNZ(Writer writer) throws IOException { 
     408    public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException { 
    405409    // working with the WebSites collection, not WebPages collection! 
    406410    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    407411 
    408      
    409     //String isMRI_filter =  
     412    String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 
    410413     
    411414    Bson orQuery = or( 
     
    414417              ); 
    415418    Bson andQuery = and( 
    416         BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
     419        BasicDBObject.parse(mriFilterString), 
    417420        orQuery); 
    418421     
     
    456459    ]); 
    457460    */ 
    458     public void aggregateContainsMRIForOverseas(Writer writer) throws UncheckedIOException { 
     461    public void aggregateContainsMRIForOverseas(Writer writer, int filterType, 
     462                        boolean isMiInURLPath) throws UncheckedIOException 
     463    { 
    459464    // working with the WebSites collection, not WebPages collection! 
    460465    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    461  
     466     
     467    String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 
    462468     
    463469    Bson orQuery = or( 
    464470              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), 
    465               BasicDBObject.parse("{urlContainsLangCodeInPath: false}") 
     471              BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}") 
     472              // e.g. "{urlContainsLangCodeInPath: false}" 
    466473              ); 
    467474    Bson andQuery = and( 
    468475        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
    469476        BasicDBObject.parse("{domain: {$not: /\\.nz/}}"), 
    470         BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
     477        BasicDBObject.parse(mriFilterString), 
    471478        orQuery); 
    472479 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33880 r33882  
    2525     
    2626    private final MongoDBAccess mongodbAccess; 
    27     private int numURLs; 
    28     private File domainsFile; 
    29  
    30  
    31      
    32     public WebPageURLsListing(MongoDBAccess mongodbAccess, 
    33                     File domainsFile) 
     27    private File outFolder; 
     28 
     29    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) 
    3430    { 
    3531    this.mongodbAccess = mongodbAccess; 
    36     this.domainsFile = domainsFile; 
    37     } 
    38      
    39     public WebPageURLsListing(MongoDBAccess mongodbAccess, 
    40                     File domainsFile, 
    41                     int numURLs) 
    42     { 
    43     this(mongodbAccess, domainsFile); 
    44     this.numURLs = numURLs; 
    45     } 
    46  
    47     public String produceURLsForPagesInMRI() { 
    48     return writeFile(MongoDBAccess.IS_MRI); 
    49     } 
    50      
    51     public String produceURLsForPagesContainingMRI() { 
    52     return writeFile(MongoDBAccess.CONTAINS_MRI); 
    53     } 
    54  
    55  
    56     public String writeFile(int filterType) { 
     32    this.outFolder = outFolder; 
     33    } 
     34     
     35    public String produceURLsForPagesInMRI(File domainsFile) { 
     36    return writeFile(MongoDBAccess.IS_MRI, domainsFile); 
     37    } 
     38     
     39    public String produceURLsForPagesContainingMRI(File domainsFile) { 
     40    return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile); 
     41    } 
     42 
     43 
     44    public String writeFile(int filterType, File domainsFile/*, int numURLs*/) { 
    5745 
    5846    ArrayList<String> urlsList = new ArrayList<String>(); 
     
    126114 
    127115    /**  
    128      * Create the file  
     116     * Create the file 5a_counts_tentativeNonAutotranslatedSites.json 
     117     * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI 
     118     * followed by counts and domain listing for overseas sites that are either from Australia 
     119     * or don't contain mi in their URL path. 
    129120     * @return full path of file generated 
    130121     */ 
    131122    public String writeTentativeNonAutotranslatedSites() { 
    132     File outFolder = new File("../mongodb-data/").getAbsoluteFile(); 
    133     File outFile = new File(outFolder, "5counts_tentativeNonAutotranslatedSites.json"); 
     123     
     124    File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json"); 
    134125 
    135126    String filename = outFile.getAbsolutePath(); 
     
    139130         ) { 
    140131        // first write out NZ sites and .nz TLD count and domains 
    141         mongodbAccess.aggregateContainsMRIForNZ(writer); 
    142         // next write out all overseas sites and .nz TLD count and domains 
    143         mongodbAccess.aggregateContainsMRIForOverseas(writer); 
     132        mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI); 
     133        // next write out all overseas sites (not NZ origin or .nz TLD) that are 
     134        // either from Australia or have no "mi" in the URL path as mi.* or */mi 
     135        boolean isMiInURLPath = false; 
     136        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 
    144137 
    145138        filename = outFile.getCanonicalPath(); 
     
    148141        logger.error(e.getMessage(), e); 
    149142    } 
     143     
     144    System.err.println("*** Wrote file: " + filename); 
    150145 
    151146    return filename; 
    152147    } 
    153      
     148 
     149    /** 
     150     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by 
     151     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path 
     152     */ 
     153    public String writeOverseasSitesWithMiInURLPath() { 
     154    File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json"); 
     155 
     156    String filename = outFile.getAbsolutePath(); 
     157    try ( 
     158         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     159         ) { 
     160        boolean isMiInURLPath = true; 
     161        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 
     162        filename = outFile.getCanonicalPath(); 
     163    } catch(Exception e) { 
     164        logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
     165        logger.error(e.getMessage(), e); 
     166    } 
     167 
     168    System.err.println("*** Wrote file: " + filename); 
     169    return filename; 
     170    } 
    154171     
    155172    public static void printUsage() { 
     
    170187         MongoDBAccess mongodb = new MongoDBAccess(); 
    171188         ) { 
    172         File domainsFile = new File(args[0]); 
    173         if(!domainsFile.exists()) { 
    174         System.err.println("File " + domainsFile + " does not exist"); 
    175         System.exit(-1); 
    176         } 
    177  
    178         //int genNumURLs = Integer.parseInt(args[1]); 
    179189 
    180190        mongodb.connectToDB(); 
    181          
    182         WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile); 
    183         //String isMRIFile = listing.produceURLsForPagesInMRI(); 
    184         //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 
    185         String filename = listing.writeTentativeNonAutotranslatedSites(); 
    186         System.err.println("Check file: " + filename); 
     191 
     192        // output files will be stored in mongodb-data-auto 
     193        File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile(); 
     194        WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder); 
     195 
     196         
     197        if(args.length >= 1) { 
     198        File domainsFile = new File(args[0]); 
     199        if(!domainsFile.exists()) { 
     200            System.err.println("File " + domainsFile + " does not exist"); 
     201            System.exit(-1); 
     202        } 
     203 
     204         
     205        //int genNumURLs = Integer.parseInt(args[1]); 
     206        //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile); 
     207        //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile); 
     208 
     209        //listing.writeWebPagesOfAllNZSitesAndDomainListing(); 
     210         
     211        } else {         
     212 
     213        String filename = listing.writeTentativeNonAutotranslatedSites(); 
     214        filename = listing.writeOverseasSitesWithMiInURLPath(); 
     215        } 
    187216 
    188217