Changeset 33801
- Timestamp:
- 2019-12-13T18:40:46+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33800 r33801 78 78 private String domainOfSite; 79 79 private int numPagesInMRI = 0; 80 private int numPagesContainingMRI = 0; 80 81 81 82 /** keep a list to store the text of each page */ … … 86 87 /** Number of language and confidence results to return for storing in MongoDB 87 88 * MongoDB runs out of space if storing too many, as we store this info per sentence 88 * and a long text document becomes a very large MongoDB document presumabl e*/89 * and a long text document becomes a very large MongoDB document presumably */ 89 90 private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model 90 91 … … 262 263 webpage.setMRISentenceCount(numSentencesInMRI); 263 264 webpage.setContainsMRI((numSentencesInMRI > 0)); 264 265 if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) { 266 // Not sure if we can trust that a single sentence detected as Maori on a page is really Maori 267 // But if at least 5 sentences are detected as Maori, it is more likely to be the case to be MRI? 268 numPagesContainingMRI++; 269 } 270 265 271 //mongodbAccess.insertWebpageInfo(webpage); 266 272 // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia … … 298 304 } catch(Exception e) { 299 305 logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e); 300 this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null 306 307 //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting 308 //this.geoLocationCountryCode = "NZ"; 309 //} 310 311 // Help along identification of domain's country by construing TLDs if 2 letters after last period mark 312 int periodIndex = domainOfSite.length()-3; 313 // .com|org etc extensions that have 3 chars afte period mark will remain unknown 314 // 2 letter extensions will be considered TLD 315 if(periodIndex >=0 && domainOfSite.charAt(periodIndex) == '.' 
&& ((periodIndex+1) < domainOfSite.length())) { 316 // has a 2 letter TLD. Make it uppercase to match return value of Utility.getCountryCodeOfDomain() above 317 String TLD = domainOfSite.substring(periodIndex+1); 318 this.geoLocationCountryCode = TLD.toUpperCase(); 319 } else { 320 this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null 321 } 301 322 } 302 323 … … 304 325 305 326 WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite, 306 totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI, 327 totalPages, this.countOfWebPagesWithBodyText, 328 this.numPagesInMRI, this.numPagesContainingMRI, 307 329 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 308 330 this.geoLocationCountryCode, this.urlContainsLangCodeInPath); -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java
r33698 r33801 12 12 public final int totalPages; 13 13 public final int countOfWebPagesWithBodyText; 14 14 15 public final int numPagesInMRI; 16 public final int numPagesContainingMRI; 15 17 16 18 public final long siteCrawledTimestamp; … … 22 24 23 25 public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 24 int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI, 26 int totalPages, int countOfWebPagesWithBodyText, 27 int numPagesInMRI, int numPagesContainingMRI, 25 28 long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 26 29 String geoLocationCountryCode, boolean urlContainsLangCodeInPath) … … 32 35 this.totalPages = totalPages; 33 36 this.countOfWebPagesWithBodyText = countOfWebPagesWithBodyText; 37 34 38 this.numPagesInMRI = numPagesInMRI; 39 this.numPagesContainingMRI = numPagesContainingMRI; 35 40 36 41 this.siteCrawledTimestamp = siteCrawledTimestamp;
Note: See TracChangeset for help on using the changeset viewer.