Changeset 33652
- Timestamp: 2019-11-12T20:41:13+13:00 (4 years ago)
- Location: other-projects/maori-lang-detection/src/org/greenstone/atea
- Files: 1 added, 4 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33651 r33652 24 24 import org.apache.log4j.Logger; 25 25 26 import org.greenstone.atea.morphia.*; 27 import dev.morphia.*; 26 28 27 29 /** … … 68 70 private MongoClient mongo = null; 69 71 private MongoDatabase database = null; 70 71 72 73 /** 74 * Mongodb Client handle via morphia, which handles the ODM (object document mapper) 75 * for MongoDB 76 */ 77 public Datastore datastore = null; 78 72 79 public MongoDBAccess() throws Exception { 73 80 boolean success = false; … … 141 148 this.database = mongo.getDatabase(DB_NAME); 142 149 */ 150 151 Morphia morphia = new Morphia(); 152 morphia.mapPackage("com.greenstone.atea.morphia"); 153 datastore = morphia.createDatastore(mongo, DB_NAME); 154 datastore.ensureIndexes(); 143 155 144 156 } -
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33634 r33652 9 9 import org.apache.commons.csv.*; 10 10 import org.apache.log4j.Logger; 11 12 //import org.bson.types.ObjectId; 13 14 import org.greenstone.atea.morphia.*; 11 15 12 16 … … 223 227 String[] sentences = maoriTxtDetector.getAllSentences(text); 224 228 int totalSentences = sentences.length; 229 int numSentencesInMRI = 0; 225 230 ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences); 226 231 ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences); 227 228 WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER ,229 SITE_COUNTER,232 233 WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/, 234 this.siteID/*SITE_COUNTER*/, 230 235 isMRI, 231 236 totalSentences, … … 233 238 overlappingSentences); 234 239 235 236 mongodbAccess.insertWebpageInfo(webpage); 240 for(SentenceInfo si : singleSentences) { 241 if(si.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 242 numSentencesInMRI++; 243 } 244 } 245 webpage.setMRISentenceCount(numSentencesInMRI); 246 webpage.setContainsMRI((numSentencesInMRI > 0)); 247 248 //mongodbAccess.insertWebpageInfo(webpage); 249 mongodbAccess.datastore.save(webpage); 237 250 } 238 251 } … … 291 304 } 292 305 293 //File geoLiteCityDatFile = new File(this.getClass().getResource("GeoLiteCity.dat").getFile()); 294 //this.geoLocationCountryCode = getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile); 306 File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile()); 307 try { 308 this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile); 309 } catch(Exception e) { 310 e.printStackTrace(); 311 this.geoLocationCountryCode = null; 312 } 295 313 296 314 int totalPages = pages.size(); 297 315 298 WebsiteInfo website = new WebsiteInfo( SITE_COUNTER,this.siteID, this.domainOfSite,316 WebsiteInfo website 
= new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite, 299 317 totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI, 300 318 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 301 319 this.geoLocationCountryCode, this.urlContainsLangCodeInPath); 302 320 303 mongodbAccess.insertWebsiteInfo(website);304 321 //mongodbAccess.insertWebsiteInfo(website); 322 mongodbAccess.datastore.save(website); 305 323 } 306 324 -
other-projects/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33634 r33652 8 8 import org.apache.log4j.Logger; 9 9 10 import org.greenstone.atea.morphia.*; 10 11 11 12 public class TextDumpPage { … … 178 179 */ 179 180 public WebpageInfo convertStoredDataToWebpageInfo( 180 long webpageID, int websiteID, boolean isMRI, int totalSentences,181 long webpageID, String siteID /*int websiteID*/, boolean isMRI, int totalSentences, 181 182 ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences) 182 183 { … … 188 189 String fetchTime = getFetchTime(); 189 190 190 WebpageInfo webpage = new WebpageInfo(webpageID, websiteID,191 WebpageInfo webpage = new WebpageInfo(webpageID, siteID/*websiteID,*/, 191 192 pageText, pageURL, isMRI, totalSentences, 192 193 charEncoding, modifiedTime, fetchTime, -
other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33651 r33652 29 29 import java.util.ArrayList; 30 30 31 import org.greenstone.atea.morphia.*; 32 31 33 /** 32 34 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
Note: See TracChangeset for help on using the changeset viewer.