Changeset 33808 for other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
- Timestamp:
- 2019-12-17T19:31:28+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
--- NutchTextDumpToMongoDB.java (r33801)
+++ NutchTextDumpToMongoDB.java (r33808)
@@ -74,5 +74,6 @@
 
     private String geoLocationCountryCode = null; /** 2 letter country code */
-    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) in its URL */
+    private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */
+    private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */
 
     private String domainOfSite;
 … …
@@ -191,6 +192,9 @@
         // contains /mi(/) in its URL
         String url = page.getPageURL();
-        if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
-            this.urlContainsLangCodeInPath = true;
+        if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
+            this.urlContainsLangCodeInPathSuffix = true;
+        }
+        if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
+            this.urlContainsLangCodeInPathPrefix = true;
         }
     }
 … …
@@ -328,5 +332,5 @@
             this.numPagesInMRI, this.numPagesContainingMRI,
             this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
-            this.geoLocationCountryCode, this.urlContainsLangCodeInPath );
+            this.geoLocationCountryCode, this.urlContainsLangCodeInPathSuffix, this.urlContainsLangCodeInPathPrefix);
 
         //mongodbAccess.insertWebsiteInfo(website);
Note:
See TracChangeset
for help on using the changeset viewer.