Changeset 33801 for other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
- Timestamp:
- 2019-12-13T18:40:46+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33800 r33801 78 78 private String domainOfSite; 79 79 private int numPagesInMRI = 0; 80 private int numPagesContainingMRI = 0; 80 81 81 82 /** keep a list to store the text of each page */ … … 86 87 /** Number of language and confidence results to return for storing in MongoDB 87 88 * MongoDB runs out of space if storing too many, as we store this info per sentence 88 * and a long text document becomes a very large MongoDB document presumabl e*/89 * and a long text document becomes a very large MongoDB document presumably */ 89 90 private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model 90 91 … … 262 263 webpage.setMRISentenceCount(numSentencesInMRI); 263 264 webpage.setContainsMRI((numSentencesInMRI > 0)); 264 265 if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) { 266 // Not sure if we can trust that a single sentence detected as Maori on a page is really Maori 267 // But if at least 5 sentences are detected as Maori, it is more likely to be the case to be MRI? 268 numPagesContainingMRI++; 269 } 270 265 271 //mongodbAccess.insertWebpageInfo(webpage); 266 272 // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia … … 298 304 } catch(Exception e) { 299 305 logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e); 300 this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null 306 307 //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting 308 //this.geoLocationCountryCode = "NZ"; 309 //} 310 311 // Help along identification of domain's country by construing TLDs if 2 letters after last period mark 312 int periodIndex = domainOfSite.length()-3; 313 // .com|org etc extensions that have 3 chars after period mark will remain unknown 314 // 2 letter extensions will be considered TLD 315 if(periodIndex >=0 && domainOfSite.charAt(periodIndex) == '.' 
&& ((periodIndex+1) < domainOfSite.length())) { 316 // has a 2 letter TLD. Make it uppercase to match return value of Utility.getCountryCodeOfDomain() above 317 String TLD = domainOfSite.substring(periodIndex+1); 318 this.geoLocationCountryCode = TLD.toUpperCase(); 319 } else { 320 this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null 321 } 301 322 } 302 323 … … 304 325 305 326 WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite, 306 totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI, 327 totalPages, this.countOfWebPagesWithBodyText, 328 this.numPagesInMRI, this.numPagesContainingMRI, 307 329 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 308 330 this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
Note:
See TracChangeset
for help on using the changeset viewer.