Changeset 33652 for other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
- Timestamp: 2019-11-12T20:41:13+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33634 r33652 9 9 import org.apache.commons.csv.*; 10 10 import org.apache.log4j.Logger; 11 12 //import org.bson.types.ObjectId; 13 14 import org.greenstone.atea.morphia.*; 11 15 12 16 … … 223 227 String[] sentences = maoriTxtDetector.getAllSentences(text); 224 228 int totalSentences = sentences.length; 229 int numSentencesInMRI = 0; 225 230 ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences); 226 231 ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences); 227 228 WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER ,229 SITE_COUNTER,232 233 WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/, 234 this.siteID/*SITE_COUNTER*/, 230 235 isMRI, 231 236 totalSentences, … … 233 238 overlappingSentences); 234 239 235 236 mongodbAccess.insertWebpageInfo(webpage); 240 for(SentenceInfo si : singleSentences) { 241 if(si.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 242 numSentencesInMRI++; 243 } 244 } 245 webpage.setMRISentenceCount(numSentencesInMRI); 246 webpage.setContainsMRI((numSentencesInMRI > 0)); 247 248 //mongodbAccess.insertWebpageInfo(webpage); 249 mongodbAccess.datastore.save(webpage); 237 250 } 238 251 } … … 291 304 } 292 305 293 //File geoLiteCityDatFile = new File(this.getClass().getResource("GeoLiteCity.dat").getFile()); 294 //this.geoLocationCountryCode = getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile); 306 File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile()); 307 try { 308 this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile); 309 } catch(Exception e) { 310 e.printStackTrace(); 311 this.geoLocationCountryCode = null; 312 } 295 313 296 314 int totalPages = pages.size(); 297 315 298 WebsiteInfo website = new WebsiteInfo( SITE_COUNTER,this.siteID, this.domainOfSite,316 WebsiteInfo website 
= new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite, 299 317 totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI, 300 318 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 301 319 this.geoLocationCountryCode, this.urlContainsLangCodeInPath); 302 320 303 mongodbAccess.insertWebsiteInfo(website);304 321 //mongodbAccess.insertWebsiteInfo(website); 322 mongodbAccess.datastore.save(website); 305 323 } 306 324
Note: See TracChangeset for help on using the changeset viewer.