Changeset 33634 for gs3-extensions
- Timestamp:
- 2019-11-08T23:59:07+13:00 (4 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 4 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33633 r33634 4 4 import com.mongodb.client.MongoCollection; 5 5 import com.mongodb.client.MongoDatabase; 6 //import com.mongodb.client.MongoIterable; 7 import com.mongodb.BasicDBObject; 6 8 import com.mongodb.MongoClient; 7 import com.mongodb.MongoCredential; 9 import com.mongodb.MongoCredential; 10 import com.mongodb.ServerAddress; 11 import com.mongodb.MongoClientOptions; 8 12 9 13 import org.bson.Document; … … 12 16 import java.io.File; 13 17 import java.io.FileReader; 18 import java.util.ArrayList; 19 import java.util.List; 14 20 import java.util.Properties; 21 15 22 16 23 import org.apache.log4j.Logger; … … 43 50 * 44 51 */ 45 public class MongoDBAccess {52 public class MongoDBAccess implements AutoCloseable { 46 53 47 54 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 48 55 49 56 static final String PROPS_FILENAME = "config.properties"; 50 public static final String DB_NAME = "anupama"; //"ateacrawldata";51 57 public static final String WEBPAGES_COLLECTION = "webpages"; 52 58 public static final String WEBSITES_COLLECTION = "websites"; 53 59 60 // configuration details, some with fallback values 54 61 private String HOST = "localhost"; 55 62 private int PORT = 27017; // mongodb port 56 63 private String USERNAME; 57 64 private String PASSWORD; 58 65 private String DB_NAME ="ateacrawldata"; 66 59 67 private MongoClient mongo = null; 60 68 private MongoDatabase database = null; … … 108 116 */ 109 117 public void connectToDB() throws Exception { 118 110 119 // Creating a Mongo client 111 120 mongo = new MongoClient( HOST, PORT ); … … 117 126 118 127 // Accessing the database 119 database = mongo.getDatabase(DB_NAME);128 this.database = mongo.getDatabase(DB_NAME); 120 129 logger.info("Credentials: "+ credential); 121 } 122 123 124 public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite, 125 int numPages, int numPagesInMRI, int numPagesContainingMRI, 126 /* TODO: String geoLocationCountryCode, boolean miURL */ 127 String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl) 130 131 /* 132 MongoCredential credential; 133 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray()); 134 logger.info("Credentials: "+ credential); 135 136 // Create our Mongo client 137 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build()); 138 System.out.println("Connected to the database successfully"); 139 140 this.database = mongo.getDatabase(DB_NAME); 141 */ 142 143 } 144 145 // TODO: which fields should be indexed? 146 147 public void showCollections() { 148 //MongoIterable<String> colls = this.database.listCollectionNames(); 149 for(String coll : this.database.listCollectionNames()) { 150 System.err.println("coll: " + coll); 151 } 152 } 153 154 155 public void insertWebsiteInfo(WebsiteInfo website) 128 156 { 129 157 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 130 Document document = new Document("id", SITE_COUNTER) 131 .append("siteFolderName", siteID) 132 .append("domain", domainOfSite) 133 .append("totalPages", numPages) 134 .append("numPagesInMRI", numPagesInMRI) 135 .append("numPagesContainingMRI", numPagesContainingMRI) 136 .append("siteCrawledTimestamp", siteCrawledTimestamp) 137 .append("siteCrawlUnfinished", siteCrawlUnfinished) 138 .append("redoCrawl", redoCrawl); 158 Document document = new Document("_id", website.id) 159 .append("siteFolderName", website.siteFolderName) 160 .append("domain", website.domain) 161 .append("totalPages", website.totalPages) 162 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) 163 .append("numPagesInMRI", website.numPagesInMRI) 164 .append("siteCrawledTimestamp", website.siteCrawledTimestamp) 165 .append("siteCrawlUnfinished", website.siteCrawlUnfinished) 166 .append("redoCrawl", website.redoCrawl); 167 168 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath); 169 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) { 170 document.put("countryCode", website.geoLocationCountryCode); 171 } 172 139 173 collection.insertOne(document); 140 System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 141 } 142 143 144 public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/ 145 String url, String charEncoding, String modTime, String fetchTime, 146 boolean isMRI, int totalSentences, int numSentencesInMRI, 147 ArrayList<SentenceInfo> singleSentences, 148 ArrayList<SentenceInfo> overlappingSentences) 174 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")" 175 + " inserted successfully into " + WEBSITES_COLLECTION); 176 } 177 178 /* TODO: 179 https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex 180 */ 181 public void insertWebpageInfo(WebpageInfo webpage) 149 182 { 150 183 // load the webpages db 'table' … … 152 185 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 153 186 154 Document document = new Document("id", WEBPAGE_COUNTER) 155 .append("siteid", site_id) 156 .append("url", url) 157 .append("charEncoding", charEncoding) 158 .append("modTime", modTime) 159 .append("fetchTime", fetchTime) 160 .append("isMRI", isMRI) 161 .append("totalSentences", totalSentences) 162 .append("numSentencesInMRI", numSentencesInMRI); 163 164 document.put("singleSentences", singleSentences); 165 document.put("overlappingSentences", overlappingSentences); 187 Document document = new Document("_id", webpage.webpageID) 188 .append("siteid", webpage.websiteID) 189 .append("url", webpage.URL) 190 .append("isMRI", webpage.isMRI) 191 .append("totalSentences", webpage.totalSentences) 192 .append("charEncoding", webpage.charEncoding) 193 .append("modTime", webpage.modifiedTime) 194 .append("fetchTime", webpage.fetchTime); 195 196 // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT: 197 //document.put("singleSentences", webpage.singleSentences); 198 //document.put("overlappingSentences", webpage.overlappingSentences); 199 200 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER: 201 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java 202 List<BasicDBObject> sentencesList = new ArrayList<>(); 203 for(SentenceInfo sentence : webpage.singleSentences) { 204 sentencesList.add(new BasicDBObject("langCode", sentence.langCode)); 205 sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel)); 206 sentencesList.add(new BasicDBObject("sentence", sentence)); 207 } 208 document.put("singleSentences", sentencesList); 209 210 List<BasicDBObject> overlappingSentencesList = new ArrayList<>(); 211 for(SentenceInfo sentence : webpage.overlappingSentences) { 212 sentencesList.add(new BasicDBObject("langCode", sentence.langCode)); 213 sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel)); 214 sentencesList.add(new BasicDBObject("sentence", sentence)); 215 } 216 document.put("singleSentences", overlappingSentencesList); 217 218 // also put the full text in there 219 document.put("text", webpage.text); 166 220 167 221 collection.insertOne(document); 168 System.out.println("website info inserted successfully into " + WEBPAGES_COLLECTION); 169 } 222 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION); 223 } 224 225 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 226 public void close() {} 170 227 171 228 … … 184 241 MongoDBAccess mongodbCon = new MongoDBAccess(); 185 242 mongodbCon.connectToDB(); 186 //mongodbCon.insertDocument(); 187 }catch(Exception e) { 243 mongodbCon.showCollections(); 244 245 } catch(Exception e) { 188 246 e.printStackTrace(); 189 247 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java
r33633 r33634 244 244 245 245 if(text.equals("")) { 246 page.addMRILanguageStatus(false);246 //page.addMRILanguageStatus(false); 247 247 continue; 248 248 } … … 250 250 boolean isMRI = maoriTxtDetector.isTextInMaori(text); 251 251 252 page.addMRILanguageStatus(isMRI);252 //page.addMRILanguageStatus(isMRI); 253 253 254 254 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33623 r33634 2 2 3 3 import java.io.*; 4 import java.util.ArrayList; 4 5 import java.util.HashMap; 5 6 import java.util.Map; … … 15 16 16 17 private Map<String, String> tuples; 18 19 private boolean isMRI = false; 20 21 boolean DEBUG_MODE = false; 17 22 18 23 public TextDumpPage(String siteID, String unparsedPageDump) { … … 90 95 tuples.put(k.trim(), v.trim()); 91 96 } else { 92 if( NutchTextDumpProcessor.DEBUG_MODE) {97 if(DEBUG_MODE) { 93 98 logger.error("No meta key for meta: " + line); 94 99 logger.error(unparsedPageDump); … … 118 123 119 124 public void debugTuples() { 120 if( NutchTextDumpProcessor.DEBUG_MODE) {125 if(DEBUG_MODE) { 121 126 logger.debug("__________________________________________"); 122 127 for(Map.Entry<String, String> entry : tuples.entrySet()) { … … 168 173 } 169 174 175 /** 176 * IMPORTANT: This method deletes the data stored in this TextDumpPage object 177 * after converting relevant fields and parameters to a WebpageInfo object 178 */ 179 public WebpageInfo convertStoredDataToWebpageInfo( 180 long webpageID, int websiteID, boolean isMRI, int totalSentences, 181 ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences) 182 { 183 // clear the map, after storing the important (meta)data 184 String pageText = getPageText(); 185 String pageURL = getPageURL(); 186 String charEncoding = getOriginalCharEncoding(); 187 String modifiedTime = getModifiedTime(); 188 String fetchTime = getFetchTime(); 189 190 WebpageInfo webpage = new WebpageInfo(webpageID, websiteID, 191 pageText, pageURL, isMRI, totalSentences, 192 charEncoding, modifiedTime, fetchTime, 193 singleSentences, overlappingSentences); 194 195 tuples.clear(); 196 197 return webpage; 198 } 199 200 201 202 /* 170 203 public void addMRILanguageStatus(boolean status) { 171 204 if(status) { … … 189 222 190 223 } 191 224 */ 192 225 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33633 r33634 142 142 } 143 143 144 /** inner class */145 public class SentenceInfo {146 public final double confidenceLevel;147 /** 3 letter lang code */148 public final String langCode;149 public final String sentence;150 151 public SentenceInfo(double confidence, String langCode, String sentence) {152 confidenceLevel = confidence;153 this.langCode = langCode;154 this.sentence = sentence;155 }156 }157 158 144 /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text 159 145 * into sentences? What if the text in any other language or a mix of languages? … … 183 169 double confidence = bestLanguage.getConfidence(); 184 170 185 sentencesList.add(new SentenceInfo(confidence, bestLanguage , sentence));171 sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 186 172 } 187 173 … … 207 193 separator = " "; 208 194 } 209 sentence = sentence + separator + sentence [i];195 sentence = sentence + separator + sentences[i]; 210 196 211 197 //System.err.println(sentence); … … 214 200 double confidence = bestLanguage.getConfidence(); 215 201 216 sentencesList.add(new SentenceInfo(confidence, bestLanguage , sentence));202 sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 217 203 } 218 204
Note:
See TracChangeset
for help on using the changeset viewer.