Changeset 33811 for other-projects/maori-lang-detection/src
- Timestamp: 2019-12-18T16:51:34+13:00 (4 years ago)
- Location: other-projects/maori-lang-detection/src/org/greenstone/atea
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33810 r33811 74 74 75 75 private String geoLocationCountryCode = null; /** 2 letter country code */ 76 private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */ 77 private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */ 76 private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */ 78 77 79 78 private String domainOfSite; … … 208 207 } 209 208 210 /* 211 // for every site, we just need to work out if any of its pages 212 // contains /mi(/) in its URL 209 /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above. 210 // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path 213 211 for(TextDumpPage aPage : pages) { 214 String url = aPage.getPageURL(); 215 logger.debug("@@@@ pageURL: " + url); 216 if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 217 this.urlContainsLangCodeInPathSuffix = true; 218 logger.info("*********** URL CONTAINS SUFFIX"); 219 } 220 // And if any contains http(s)://mi. in its URL 221 if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 222 this.urlContainsLangCodeInPathPrefix = true; 223 } 212 inspectPageURLPath(aPage); 224 213 } 225 214 */ … … 233 222 //logger.debug("@@@@ pageURL: " + url); 234 223 235 // check if each page in site contains /mi(/) in URL, and if so set a site-level variable accordingly 236 if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 237 this.urlContainsLangCodeInPathSuffix = true; 238 } 239 // And if any page contains http(s)://mi. 
in its URL, then set site level variable for this accordingly 240 if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 241 this.urlContainsLangCodeInPathPrefix = true; 224 if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site, 225 // check if this page of the site contains /mi(/) or http(s)://mi in its URL path 226 if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) { 227 this.urlContainsLangCodeInPath = true; 228 } 242 229 } 243 230 } … … 356 343 this.numPagesInMRI, this.numPagesContainingMRI, 357 344 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 358 this.geoLocationCountryCode, this.urlContainsLangCodeInPath Suffix, this.urlContainsLangCodeInPathPrefix);345 this.geoLocationCountryCode, this.urlContainsLangCodeInPath); 359 346 360 347 //mongodbAccess.insertWebsiteInfo(website); -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java
r33808 r33811 21 21 22 22 public final String geoLocationCountryCode; 23 public final boolean urlContainsLangCodeInPathSuffix; 24 public final boolean urlContainsLangCodeInPathPrefix; 23 public final boolean urlContainsLangCodeInPath; 25 24 26 25 public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, … … 28 27 int numPagesInMRI, int numPagesContainingMRI, 29 28 long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 30 String geoLocationCountryCode, boolean urlContainsLangCodeInPath Suffix, boolean urlContainsLangCodeInPathPrefix)29 String geoLocationCountryCode, boolean urlContainsLangCodeInPath) 31 30 { 32 31 //this.id = siteCount; … … 45 44 46 45 this.geoLocationCountryCode = geoLocationCountryCode; 47 this.urlContainsLangCodeInPathSuffix = urlContainsLangCodeInPathSuffix; 48 this.urlContainsLangCodeInPathPrefix = urlContainsLangCodeInPathPrefix; 46 this.urlContainsLangCodeInPath = urlContainsLangCodeInPath; 49 47 } 50 48 }
Note: See TracChangeset for help on using the changeset viewer.