Changeset 33657 for other-projects
- Timestamp:
- 2019-11-12T21:33:57+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
--- NutchTextDumpToMongoDB.java (r33656)
+++ NutchTextDumpToMongoDB.java (r33657)
@@ -171,14 +171,18 @@
         // process final webpage record:
         //debugPageDump(pageDump);
-
-        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
-        pages.add(page);
-        pageDump = null;
-
-        // for every site, we just need to work out if any of its pages
-        // contains /mi(/) in its URL
-        String url = page.getPageURL();
-        if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
-            this.urlContainsLangCodeInPath = true;
+
+        if(pageDump == null) {
+            logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
+        } else {
+            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
+            pages.add(page);
+            pageDump = null;
+
+            // for every site, we just need to work out if any of its pages
+            // contains /mi(/) in its URL
+            String url = page.getPageURL();
+            if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
+                this.urlContainsLangCodeInPath = true;
+            }
         }
 
 … …
@@ -306,7 +310,11 @@
     File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
     try {
-        this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
-    } catch(Exception e) {
-        e.printStackTrace();
+        if(this.domainOfSite.equals("UNKNOWN")) {
+            this.geoLocationCountryCode = "UNKNOWN";
+        } else {
+            this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
+        }
+    } catch(Exception e) {
+        logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);
         this.geoLocationCountryCode = null;
     }
Note:
See TracChangeset
for help on using the changeset viewer.