Changeset 33909

Show
Ignore:
Timestamp:
12.02.2020 19:02:44 (6 days ago)
Author:
ak19
Message:

1. Implemented tables 3 to 5. 2. Rolled back the introduction of the basicDomain field (the domain stripped of its http/https and www prefixes): the code can compute this value and sort on it alphabetically itself, whereas sorting on the field did not work properly in mongodb. 3. For the mongodb queries that produce domain results, the code now sorts the domains (stripped of protocol and www) and ensures the domain list is unique. 4. Split the MongoDBAccess class in two: the connection code stays in MongoDBAccess.java, and the querying code moves to MongoDBQueryer (a subclass of MongoDBAccess), which so far is used exclusively by WebPageURLsListing.java.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33906 r33909  
    2828import org.bson.BsonArray; 
    2929import org.bson.BsonString; 
     30import org.bson.BsonValue; 
    3031import org.bson.Document; 
    3132import org.bson.conversions.Bson; 
     
    5253import java.util.List; 
    5354import java.util.Properties; 
     55import java.util.TreeSet; 
    5456import java.util.regex.Pattern; 
    5557 
     
    98100    public static final String WEBSITES_COLLECTION = "Websites"; 
    99101 
    100     public static final String NEWLINE = System.getProperty("line.separator"); 
    101      
    102     /** mongodb filter types to execute */ 
    103     public static final int IS_MRI = 0; 
    104     public static final int CONTAINS_MRI = 1; 
    105  
    106     /** Some reused fieldnames in the Websites collection */ 
    107     private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI"; 
    108     private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI"; 
    109  
     102     
    110103    // configuration details, some with fallback values 
    111     private String HOST = "localhost"; 
    112     private int PORT = 27017; // mongodb port 
    113     private String USERNAME; 
    114     private String PASSWORD;     
    115     private String DB_NAME ="ateacrawldata"; 
    116      
    117     private MongoClient mongo = null; 
    118     private MongoDatabase database = null; 
     104    protected String HOST = "localhost"; 
     105    protected int PORT = 27017; // mongodb port 
     106    protected String USERNAME; 
     107    protected String PASSWORD;     
     108    protected String DB_NAME ="ateacrawldata"; 
     109     
     110    protected MongoClient mongo = null; 
     111    protected MongoDatabase database = null; 
    119112 
    120113    /**  
     
    210203        System.err.println("coll: " + coll); 
    211204    } 
     205    } 
     206 
     207    protected MongoCollection<Document> getWebpagesCollection() { 
     208    return this.database.getCollection(WEBPAGES_COLLECTION); 
     209    } 
     210    protected MongoCollection<Document> getWebsitesCollection() { 
     211    return this.database.getCollection(WEBSITES_COLLECTION); 
    212212    } 
    213213     
     
    219219        .append("siteFolderName", website.siteFolderName)  
    220220        .append("domain", website.domain)  
    221         .append("basicDomain", website.basicDomain)  
    222221        .append("totalPages", website.totalPages) 
    223222        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) 
     
    303302    } 
    304303    */ 
    305      
    306     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
    307     return queryAllMatchingURLsFilteredBy(domain, IS_MRI); 
    308     } 
    309     public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) { 
    310     return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI); 
    311     } 
    312      
    313     /**  
    314      * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/  
    315      * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html 
    316      * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection 
    317      * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find 
    318      * 
    319      * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java 
    320      * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 
    321      * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java 
    322      * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 
    323      */ 
    324     public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) { 
    325      
    326     final ArrayList<String> urlsList = new ArrayList<String>(); 
    327      
    328     // remove any http(s)://(www.) from the start of URL first 
    329     // since it goes into a regex 
    330     domain = Utility.stripProtocolAndWWWFromURL(domain); 
    331  
    332     // load the "webpages" db table 
    333     // in mongodb, the equivalent of db tables are called 'collections'  
    334     MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
    335  
    336     // code we'll execute in Iterable.forEach() below 
    337     // see also https://www.baeldung.com/foreach-java 
    338     Block<Document> storeURL = new Block<Document>() { 
    339         @Override 
    340         public void apply(final Document document) { 
    341             //System.out.println(document.toJson()); 
    342             String url = document.getString("URL"); 
    343             // add to our urlsList 
    344             //System.out.println(url); 
    345             urlsList.add(url); 
    346         } 
    347         }; 
    348  
    349      
    350     // Run the following mongodb query: 
    351     //    db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 
    352      
    353     // 1. One way that works: 
    354     //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
    355  
    356     // 2. Another way: 
    357     //String query = "{URL: /DOMAIN/, isMRI: true}"; 
    358     String query = "{URL: /DOMAIN/, "; 
    359     if(filterType == IS_MRI) { 
    360         query += "isMRI: true}"; 
    361     } else if(filterType == CONTAINS_MRI) { 
    362         query += "containsMRI: true}"; 
    363     } 
    364      
    365     domain = domain.replace(".", "\\."); // escape dots in domain for regex 
    366     query = query.replace("DOMAIN", domain); 
    367  
    368     //System.err.println("Executing find query: " + query); 
    369      
    370     BasicDBObject findObj = BasicDBObject.parse(query); 
    371     BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}"); 
    372      
    373  
    374     collection.find(findObj).projection(projectionObj).forEach(storeURL); 
    375      
    376     return urlsList; 
    377     } 
    378  
    379     /**      
    380      * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: 
    381      * 
    382      * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver 
    383      * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 
    384      * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo 
    385      * 
    386      * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) 
    387      * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates 
    388      * On using group(TExpression) inside collection.aggregate(). 
    389      * 
    390      *  For forEach lambda expressions, see also https://www.baeldung.com/foreach-java 
    391      *  and https://www.javatpoint.com/java-8-foreach 
    392      *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 
    393      * 
    394      * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI, 
    395      * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 
    396      * and total counts of numPagesInMRI and numPagesContainingMRI across all these 
    397      * matching sites. 
    398      *  
    399      * The mongodb aggregate() we want to run this time: 
    400      * 
    401        db.Websites.aggregate([ 
    402        { 
    403         $match: { 
    404             $and: [ 
    405                 {numPagesContainingMRI: {$gt: 0}}, 
    406                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
    407             ] 
    408           } 
    409     }, 
    410     { $unwind: "$geoLocationCountryCode" }, 
    411     { 
    412           $group: { 
    413             _id: "nz", 
    414             count: { $sum: 1 },             
    415         domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 
    416           } 
    417     }, 
    418     { $sort : { count : -1} } 
    419     ]); 
    420     */ 
    421     public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException { 
    422     // working with the WebSites collection, not WebPages collection! 
    423     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    424  
    425     String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 
    426      
    427     Bson orQuery = or( 
    428               BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 
    429               BasicDBObject.parse("{domain: /\\.nz/}") 
    430               ); 
    431     Bson andQuery = and( 
    432         BasicDBObject.parse(mriFilterString), 
    433         orQuery); 
    434      
    435     // Hopefully the lambda expression (forEach()) at end means 
    436     // we write out each result Document as we get it 
    437     collection.aggregate(Arrays.asList( 
    438          match(andQuery), 
    439          unwind("$geoLocationCountryCode"), 
    440          group("NZ", Arrays.asList(sum("count", 1), 
    441                    addToSet("domain", "$basicDomain"))), 
    442          sort(BasicDBObject.parse("{count : -1}")) 
    443      )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
    444  
    445     // should only have one doc for NZ since it's a count by geolocation. 
    446  
    447     return; 
    448     } 
    449      
    450     /** 
    451      * Count by country code of non-NZ websites containing a positive number of sentences in MRI, 
    452      * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 
    453      * and total counts of numPagesInMRI and numPagesContainingMRI across all these 
    454      * matching sites. 
    455      * 
    456      * The aggregate() we want to run this time: 
    457      * 
    458        db.Websites.aggregate([ 
    459        { 
    460          $match: { 
    461             $and: [ 
    462                 {geoLocationCountryCode: {$ne: "NZ"}}, 
    463                 {domain: {$not: /\.nz/}}, 
    464                 {numPagesContainingMRI: {$gt: 0}}, 
    465                 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}             
    466             ] 
    467       } 
    468     }, 
    469     { $unwind: "$geoLocationCountryCode" }, 
    470     { 
    471           $group: { 
    472             _id: {$toLower: '$geoLocationCountryCode'}, 
    473             count: { $sum: 1 }, 
    474         domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 
    475           } 
    476      }, 
    477      { $sort : { count : -1} } 
    478     ]); 
    479     */ 
    480     public void aggregateContainsMRIForOverseas(Writer writer, int filterType, 
    481                         boolean isMiInURLPath) throws UncheckedIOException 
    482     { 
    483     // working with the WebSites collection, not WebPages collection! 
    484     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    485      
    486     String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 
    487      
    488     Bson orQuery = or( 
    489               BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), 
    490               BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}") 
    491               // e.g. "{urlContainsLangCodeInPath: false}" 
    492               ); 
    493     Bson andQuery = and( 
    494         BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
    495         BasicDBObject.parse("{domain: {$not: /\\.nz/}}"), 
    496         BasicDBObject.parse(mriFilterString), 
    497         orQuery); 
    498  
    499     collection.aggregate(Arrays.asList( 
    500          match(andQuery),  //match(BasicDBObject.parse(matchQuery)) 
    501          // match((List<DBObject>)JSON.parse(matchQuery)), 
    502          unwind("$geoLocationCountryCode"), 
    503          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),  
    504                         addToSet("domain", "$basicDomain"))), 
    505          sort(BasicDBObject.parse("{count : -1}")) 
    506        )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
    507  
    508     // casting to Block<Document> necessary because otherwise we see the error at 
    509     // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 
    510  
    511     // Less efficient way is to keep all the results in memory and then 
    512     // write them out one at a time 
    513     /* 
    514     AggregateIterable<Document> output 
    515         = collection.aggregate(Arrays.asList( 
    516          match(andQuery),  //match(BasicDBObject.parse(matchQuery)) 
    517          // match((List<DBObject>)JSON.parse(matchQuery)), 
    518          unwind("$geoLocationCountryCode"), 
    519          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
    520          sort(BasicDBObject.parse("{count : -1}")) 
    521      )); 
    522  
    523      
    524     for (Document doc : output) { 
    525         //System.out.println(doc); 
    526         System.out.println(doc.toJson()); 
    527          
    528     } 
    529     */ 
    530     return; 
    531     } 
    532  
    533     /** Do the aggregates for writing out tables. 
    534        Table1: 
    535         
    536     */ 
    537     public void writeTables(File outFolder) { 
    538     // In this function, we're always dealing with the Websites mongodb collection. 
    539     MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    540  
    541     String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"}; 
    542     for (int tableNum = 1; tableNum < tableNames.length; tableNum++) { 
    543         File outFile = new File(outFolder, tableNames[tableNum] + ".json"); 
    544         File csvFile = new File(outFolder, tableNames[tableNum] + ".csv"); 
    545         try ( 
    546          Writer writer = new BufferedWriter(new FileWriter(outFile)); 
    547          CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT); 
    548          ) { 
    549  
    550         // Write out the CSV column headings 
    551         // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 
    552         csvWriter.printRecord("countryCode", "siteCount", 
    553               "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/); 
    554          
    555         AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
    556          
    557         int docNum = 0; 
    558         for (Document doc : output) {        
    559             //System.out.println(doc); 
    560             writeDocAsJsonRecord(++docNum, doc, writer); 
    561             writeDocAsCSVRecord(++docNum, doc, csvWriter); 
    562         }        
    563         logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv"); 
    564         } catch(UncheckedIOException ioe) { 
    565         logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe); 
    566         } 
    567         catch(Exception e) { 
    568         logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e); 
    569         } 
    570     } 
    571     } 
    572  
    573     public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) { 
    574      
    575     AggregateIterable<Document> output = null; 
    576  
    577     switch(tableNum) { 
    578          
    579     case 1: 
    580         /* 1table_allCrawledSites - 
    581  
    582            db.Websites.aggregate([        
    583            { $unwind: "$geoLocationCountryCode" }, 
    584            { 
    585            $group: { 
    586            _id: "$geoLocationCountryCode", 
    587            count: { $sum: 1 }, 
    588            //domain: { $addToSet: '$domain' }, 
    589            numPagesInMRICount: { $sum: '$numPagesInMRI' }, 
    590            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 
    591            } 
    592            }, 
    593            { $sort : { count : -1} } 
    594            ]); 
    595          */ 
    596         output = collection.aggregate(Arrays.asList( 
    597                            //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")), 
    598          unwind("$geoLocationCountryCode"), 
    599          group("$geoLocationCountryCode", Arrays.asList( 
    600                                 sum("count", 1), 
    601                                 /*addToSet("domain", "$domain"),*/ 
    602                                 sum("numPagesInMRICount", "$numPagesInMRI"), 
    603                                 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))), 
    604          sort(BasicDBObject.parse("{count : -1}"))        
    605         )); 
    606         break; 
    607  
    608     case 2: 
    609         /* 
    610           db.Websites.aggregate([ 
    611           { $match: { numPagesInMRI: {$gt: 0} } }, 
    612           { $unwind: "$geoLocationCountryCode" }, 
    613           { 
    614           $group: { 
    615           _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower 
    616           count: { $sum: 1 }, 
    617           //domain: { $addToSet: '$domain' }, 
    618           numPagesInMRICount: { $sum: '$numPagesInMRI' }, 
    619           numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 
    620           } 
    621           }, 
    622           { $sort : { count : -1} } 
    623           ]); 
    624          */ 
    625          output = collection.aggregate(Arrays.asList( 
    626                            match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")), 
    627          unwind("$geoLocationCountryCode"), 
    628          group("$geoLocationCountryCode", Arrays.asList( 
    629                                 sum("count", 1), 
    630                                 /*addToSet("domain", "$domain"),*/ 
    631                                 sum("numPagesInMRICount", "$numPagesInMRI"), 
    632                                 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))), 
    633          sort(BasicDBObject.parse("{count : -1}"))        
    634         )); 
    635         break; 
    636          
    637     default: logger.error("Unknown table number: " + tableNum); 
    638          
    639     } 
    640  
    641      return output; 
    642      
    643     } 
    644      
    645      
    646      
    647     /** 
    648      * called by lambda forEach() call on Document objects to write them out to a file. 
    649      * Have to deal with unreported exceptions here that can't be dealt with when doing 
    650      * the actual forEach(). See 
    651      * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach 
    652      */     
    653     public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 
    654  
    655     // If there's a domain field in the json Doc, sort this domain listing alphabetically 
    656     Object domainList = doc.remove("domain"); 
    657     if(domainList != null) { 
    658         doc.put("domain", sortAlphabetically(domainList)); 
    659     } 
    660      
    661     //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); 
    662     // Can't control json output to add newlines after each array element, 
    663     // no matter which JsonMode is used. 
    664      
    665     // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 
    666     // Still can't control array element output, 
    667     // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too: 
    668     //JsonWriterSettings writeSettings = new JsonWriterSettings(); 
    669     //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); 
    670     //writer.write(doc.toJson(writeSettings) + NEWLINE); 
    671  
    672     // Not the JsonWriter of mongodb java driver: 
    673     // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line 
    674      
    675     // Have to use gson's pretty print to produce a json string that contains 
    676     // newlines after every array element in the json: 
    677      
    678     String jsonStr = prettyPrintJson(doc.toJson()); 
    679     //System.err.println(jsonStr); 
    680     try { 
    681         writer.write(jsonStr + NEWLINE); 
    682     } catch (IOException ex) { 
    683         //throw ex; 
    684         throw new UncheckedIOException(ex); 
    685     }        
    686     } 
    687  
    688     private List sortAlphabetically(Object list) { 
    689     BsonArray domainList = (BsonArray)list; 
    690     //for(String domain : domainList) { 
    691     for(int i = domainList.size() - 1; i >= 0; i--) { 
    692         BsonString domain = domainList.get(i).asString(); 
    693         String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString()); 
    694         domainList.set(i, new BsonString(domainStr));        
    695     } 
    696  
    697     return domainList; 
    698     } 
    699      
    700     public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException { 
    701     String jsonStr = prettyPrintJson(doc.toJson()); 
    702     //System.err.println(jsonStr); 
    703     try { 
    704         writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE); 
    705     } catch (IOException ex) { 
    706         //throw ex; 
    707         throw new UncheckedIOException(ex); 
    708     } 
    709     } 
    710  
    711     // TODO 
    712     //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException { 
    713     public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException { 
    714     String jsonStr = doc.toJson(); 
    715     JsonParser parser = new JsonParser(); 
    716     JsonElement json = parser.parse(jsonStr); 
    717  
    718     JsonObject jsonObj = (JsonObject)json; 
    719      
    720     String countryCode = jsonObj.get("_id").getAsString(); 
    721     int siteCount = jsonObj.get("count").getAsInt(); 
    722     int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt(); 
    723     int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();   
    724      
    725     //System.err.println(jsonStr); 
    726     try { 
    727         //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE); 
    728         csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount); 
    729     } catch (IOException ex) { 
    730         //throw ex; 
    731         throw new UncheckedIOException(ex); 
    732     } 
    733     } 
    734      
    735     public String prettyPrintJson(String jsonStr) { 
    736     Gson gson = new GsonBuilder().setPrettyPrinting().create(); 
    737     JsonParser jp = new JsonParser(); 
    738     JsonElement je = jp.parse(jsonStr); 
    739     String prettyJsonString = gson.toJson(je); 
    740     return prettyJsonString; 
    741     } 
    742304 
    743305     
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33906 r33909  
    7777 
    7878    private String domainOfSite; 
    79     private String baseSiteDomain; // domainOfSite stripped of any http(s)://www. 
     79    //private String baseSiteDomain; // domainOfSite stripped of any http(s)://www. 
    8080    private int numPagesInMRI = 0; 
    8181    private int numPagesContainingMRI = 0; 
     
    203203        String url = firstPage.getPageURL(); 
    204204        this.domainOfSite = Utility.getDomainForURL(url, true); 
    205         this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite); 
     205        //this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite); 
    206206    } 
    207207    else { 
    208208        this.domainOfSite = "UNKNOWN"; 
    209         this.baseSiteDomain = "UNKNOWN"; 
     209        //this.baseSiteDomain = "UNKNOWN"; 
    210210    } 
    211211     
     
    343343 
    344344    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, 
    345           this.domainOfSite, this.baseSiteDomain, 
     345          this.domainOfSite, //this.baseSiteDomain, 
    346346          totalPages, this.countOfWebPagesWithBodyText, 
    347347          this.numPagesInMRI, this.numPagesContainingMRI, 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33906 r33909  
    2929    static private final long FIXED_SEED = 1000; 
    3030     
    31     private final MongoDBAccess mongodbAccess; 
     31    private final MongoDBQueryer mongodbQueryer; 
    3232    private File outFolder; 
    3333 
     
    4949     
    5050     
    51     public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) 
     51    public WebPageURLsListing(MongoDBQueryer mongodbQueryer, File outFolder) 
    5252    { 
    53     this.mongodbAccess = mongodbAccess; 
     53    this.mongodbQueryer = mongodbQueryer; 
    5454    this.outFolder = outFolder; 
    5555    } 
     
    5757     
    5858    public void produceURLsForPagesInMRI(File domainsFile) { 
    59     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 
     59    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile); 
    6060    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 
    6161    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     
    6666     
    6767    public void produceURLsForPagesContainingMRI(File domainsFile) { 
    68     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile);     
     68    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile);    
    6969    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 
    7070    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     
    9797            domain = domain.substring(0, index); 
    9898            } 
    99             ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 
     99            ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType); 
    100100 
    101101            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know 
    102             if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) { 
     102            if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) { 
    103103            System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI."); 
    104104            } 
     
    133133    public void mriWebPageListingForDomainListing(File domainsFile) { 
    134134 
    135     int filterType = MongoDBAccess.IS_MRI; 
     135    int filterType = MongoDBQueryer.IS_MRI; 
    136136     
    137137    // for overseas websites,  
     
    139139 
    140140    // 0. get a list of all the web pages in the given domain listing where isMRI = true 
    141     ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 
     141    ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile); 
    142142        // produceURLsForPagesInMRI(domainsFile); 
    143143     
     
    148148    // 2. write all the URLs in urlsList to a file 
    149149    //File outFolder = domainsFile.getParentFile(); 
    150     String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; 
     150    String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_"; 
    151151    File outFile = new File(outFolder, fileName+domainsFile.getName()); 
    152152 
     
    219219 
    220220    /* ---------------------------------------- */ 
    221  
    222221    /**  
    223      * Create the file 5counts_tentativeNonAutotranslatedSites.json 
     222     * Create the file 5counts_containsMRISites_allNZGrouped.json 
     223     * that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages 
     224     * that CONTAIN_MRI, followed by counts and domains listing for overseas sites 
     225     * that CONTAIN_MRI. 
     226     * @return full path of file generated 
     227     */ 
     228    public String writeContainsMRISites_nzSitesAndTLDsGrouped() { 
     229     
     230    File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json"); 
     231 
     232    String filename = Utility.getFilePath(outFile); 
     233     
     234    try ( 
     235         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     236         ) { 
     237        // first write out NZ sites and .nz TLD count and domains 
     238        mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI); 
     239        // next write out all overseas sites (not NZ origin or .nz TLD) 
     240        // that have no "mi" in the URL path as mi.* or */mi 
     241        boolean isMiInURLPath = false; 
     242        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI); 
     243         
     244    } catch(Exception e) { 
     245        logger.error("Unable to write to file " + filename); 
     246        logger.error(e.getMessage(), e); 
     247    } 
     248     
     249    System.err.println("*** Wrote file: " + filename); 
     250 
     251    return filename; 
     252    } 
     253     
     254    /**  
     255     * Create the file 5a_counts_tentativeNonAutotranslatedSites.json 
    224256     * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI 
    225257     * followed by counts and domain listing for overseas sites that are either from Australia 
     
    237269         ) { 
    238270        // first write out NZ sites and .nz TLD count and domains 
    239         mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI); 
     271        mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI); 
    240272        // next write out all overseas sites (not NZ origin or .nz TLD) 
    241273        // that have no "mi" in the URL path as mi.* or */mi 
    242274        boolean isMiInURLPath = false; 
    243         mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 
     275        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath); 
    244276         
    245277    } catch(Exception e) { 
     
    254286 
    255287    /** 
     288     * Create the file 5b_counts_overseasSitesWithMiInPath.json 
    256289     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by 
    257290     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path. 
     
    267300         ) { 
    268301        boolean isMiInURLPath = true; 
    269         mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 
     302        mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath); 
    270303 
    271304    } catch(Exception e) { 
     
    300333 
    301334    try ( 
    302          MongoDBAccess mongodb = new MongoDBAccess(); 
     335         MongoDBQueryer mongodb = new MongoDBQueryer(); 
    303336         ) { 
    304337 
     
    335368        //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360)); 
    336369        //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681)); 
    337          
    338         String filename = listing.writeTentativeNonAutotranslatedSites(); 
     370 
     371        // get all sites where >0 pages have containsMRI=true 
     372        // grouping NZ sites and .nz TLDs together and remainder under overseas 
     373        // geolocations. 
     374        String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped(); 
     375 
     376        // separately: 
     377        // - all NZ containsMRI + overseas tentative non-product sites with containsMRI 
     378        // - overseas tentative product sites with containsMRI 
     379        filename = listing.writeTentativeNonAutotranslatedSites(); 
    339380        filename = listing.writeOverseasSitesWithMiInURLPath(); 
    340381 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33906 r33909  
    99    public final String siteFolderName; 
    1010    public final String domain; 
    11     public final String basicDomain; // domain without protocol and www. prefix 
     11    //public final String basicDomain; // domain without protocol and www. prefix 
    1212     
    1313    public final int totalPages; 
     
    2525     
    2626    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, 
    27                String domainOfSite, String baseSiteDomain, 
     27               String domainOfSite, //String baseSiteDomain, 
    2828               int totalPages, int countOfWebPagesWithBodyText, 
    2929               int numPagesInMRI, int numPagesContainingMRI, 
     
    3434    this.siteFolderName = siteFolderName; 
    3535    this.domain = domainOfSite; 
    36     this.basicDomain = baseSiteDomain; 
     36    //this.basicDomain = baseSiteDomain; 
    3737     
    3838    this.totalPages = totalPages;