Changeset 33808 for other-projects
- Timestamp:
- 2019-12-17T19:31:28+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33801 r33808 74 74 75 75 private String geoLocationCountryCode = null; /** 2 letter country code */ 76 private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) in its URL */ 76 private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */ 77 private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */ 77 78 78 79 private String domainOfSite; … … 191 192 // contains /mi(/) in its URL 192 193 String url = page.getPageURL(); 193 if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) { 194 this.urlContainsLangCodeInPath = true; 194 if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 195 this.urlContainsLangCodeInPathSuffix = true; 196 } 197 if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 198 this.urlContainsLangCodeInPathPrefix = true; 195 199 } 196 200 } … … 328 332 this.numPagesInMRI, this.numPagesContainingMRI, 329 333 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 330 this.geoLocationCountryCode, this.urlContainsLangCodeInPath );334 this.geoLocationCountryCode, this.urlContainsLangCodeInPathSuffix, this.urlContainsLangCodeInPathPrefix); 331 335 332 336 //mongodbAccess.insertWebsiteInfo(website); -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java
r33801 r33808 21 21 22 22 public final String geoLocationCountryCode; 23 public final boolean urlContainsLangCodeInPath; 23 public final boolean urlContainsLangCodeInPathSuffix; 24 public final boolean urlContainsLangCodeInPathPrefix; 24 25 25 26 public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, … … 27 28 int numPagesInMRI, int numPagesContainingMRI, 28 29 long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 29 String geoLocationCountryCode, boolean urlContainsLangCodeInPath )30 String geoLocationCountryCode, boolean urlContainsLangCodeInPathSuffix, boolean urlContainsLangCodeInPathPrefix) 30 31 { 31 32 //this.id = siteCount; … … 44 45 45 46 this.geoLocationCountryCode = geoLocationCountryCode; 46 this.urlContainsLangCodeInPath = urlContainsLangCodeInPath; 47 this.urlContainsLangCodeInPathSuffix = urlContainsLangCodeInPathSuffix; 48 this.urlContainsLangCodeInPathPrefix = urlContainsLangCodeInPathPrefix; 47 49 } 48 50 }
Note: See TracChangeset for help on using the changeset viewer.