Changeset 33810 for other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
- Timestamp:
- 2019-12-17T21:48:08+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33808 r33810
    55   55  static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());
    56   56
-   57       static boolean DEBUG_MODE = true;
+        57  static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class
    58   58
    59   59  /** Counter for number of sites.
     …    …
   159  159  //page.getText();
   160  160  pages.add(page);
+       161  inspectPageURLPath(page);
   161  162  pageDump = null;
   162  163
     …    …
   176  177  // can throw away any newlines between text start and end markers.
   177  178
-  178       prevLine = line;
+       179  prevLine = line;
   179  180  }
   180  181
     …    …
   187  188  TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
   188  189  pages.add(page);
+       190  inspectPageURLPath(page);
   189  191  pageDump = null;
-  190
-  191       // for every site, we just need to work out if any of its pages
-  192       // contains /mi(/) in its URL
-  193       String url = page.getPageURL();
-  194       if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
-  195           this.urlContainsLangCodeInPathSuffix = true;
-  196       }
-  197       if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
-  198           this.urlContainsLangCodeInPathPrefix = true;
-  199       }
-  200       }
+       192  }
+       193
   201  194
   202  195  } catch (IOException ioe) {
     …    …
   215  208  }
   216  209
-  217
+       210  /*
+       211  // for every site, we just need to work out if any of its pages
+       212  // contains /mi(/) in its URL
+       213  for(TextDumpPage aPage : pages) {
+       214      String url = aPage.getPageURL();
+       215      logger.debug("@@@@ pageURL: " + url);
+       216      if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
+       217          this.urlContainsLangCodeInPathSuffix = true;
+       218          logger.info("*********** URL CONTAINS SUFFIX");
+       219      }
+       220      // And if any contains http(s)://mi. in its URL
+       221      if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
+       222          this.urlContainsLangCodeInPathPrefix = true;
+       223      }
+       224  }
+       225  */
   218  226  prepareSiteStats(mongodbAccess);
   219  227  }
   220  228
+       229  /** for every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
+       230   * This method is called on each page of a site as the page is created. */
+       231  private void inspectPageURLPath(TextDumpPage page) {
+       232      String url = page.getPageURL();
+       233      //logger.debug("@@@@ pageURL: " + url);
+       234
+       235      // check if each page in site contains /mi(/) in URL, and if so set a site-level variable accordingly
+       236      if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
+       237          this.urlContainsLangCodeInPathSuffix = true;
+       238      }
+       239      // And if any page contains http(s)://mi. in its URL, then set site level variable for this accordingly
+       240      if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
+       241          this.urlContainsLangCodeInPathPrefix = true;
+       242      }
+       243  }
+       244
   221  245
   222  246  private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {
Note: See TracChangeset for help on using the changeset viewer.