Changeset 33633 for gs3-extensions
- Timestamp:
- 2019-11-08T19:43:39+13:00 (4 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
- 1 moved
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33626 r33633 47 47 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 48 48 49 String HOST = "localhost";50 int PORT = 27017; // mongodb port51 String PROPS_FILENAME = "config.properties";52 String DB_NAME = "ateacrawldata";49 static final String PROPS_FILENAME = "config.properties"; 50 public static final String DB_NAME = "anupama"; //"ateacrawldata"; 51 public static final String WEBPAGES_COLLECTION = "webpages"; 52 public static final String WEBSITES_COLLECTION = "websites"; 53 53 54 private String HOST = "localhost"; 55 private int PORT = 27017; // mongodb port 54 56 private String USERNAME; 55 private String PASSWORD; 56 57 private String PASSWORD; 57 58 58 59 private MongoClient mongo = null; 59 60 private MongoDatabase database = null; 61 60 62 61 63 public MongoDBAccess() throws Exception { … … 118 120 logger.info("Credentials: "+ credential); 119 121 } 122 120 123 124 public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite, 125 int numPages, int numPagesInMRI, int numPagesContainingMRI, 126 /* TODO: String geoLocationCountryCode, boolean miURL */ 127 String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl) 128 { 129 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 130 Document document = new Document("id", SITE_COUNTER) 131 .append("siteFolderName", siteID) 132 .append("domain", domainOfSite) 133 .append("totalPages", numPages) 134 .append("numPagesInMRI", numPagesInMRI) 135 .append("numPagesContainingMRI", numPagesContainingMRI) 136 .append("siteCrawledTimestamp", siteCrawledTimestamp) 137 .append("siteCrawlUnfinished", siteCrawlUnfinished) 138 .append("redoCrawl", redoCrawl); 139 collection.insertOne(document); 140 System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 141 } 121 142 122 /* 123 public void insertDocument() { 124 MongoCollection<Document> collection = 
this.database.getCollection("sampleCollection"); 143 144 public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/ 145 String url, String charEncoding, String modTime, String fetchTime, 146 boolean isMRI, int totalSentences, int numSentencesInMRI, 147 ArrayList<SentenceInfo> singleSentences, 148 ArrayList<SentenceInfo> overlappingSentences) 149 { 150 // load the webpages db 'table' 151 // in mongodb, the equivalent of db tables are called 'collections' 152 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 153 154 Document document = new Document("id", WEBPAGE_COUNTER) 155 .append("siteid", site_id) 156 .append("url", url) 157 .append("charEncoding", charEncoding) 158 .append("modTime", modTime) 159 .append("fetchTime", fetchTime) 160 .append("isMRI", isMRI) 161 .append("totalSentences", totalSentences) 162 .append("numSentencesInMRI", numSentencesInMRI); 163 164 document.put("singleSentences", singleSentences); 165 document.put("overlappingSentences", overlappingSentences); 166 167 collection.insertOne(document); 168 System.out.println("website info inserted successfully into " + WEBPAGES_COLLECTION); 125 169 } 126 */170 127 171 128 172 // TODO: -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java
r33623 r33633 38 38 * TO COMPILE: 39 39 * maori-lang-detection/src$ 40 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDump Processor.java40 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java 41 41 * 42 42 * TO RUN: 43 43 * maori-lang-detection/src$ 44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDump Processor../crawled-small44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small 45 45 * 46 46 * or: 47 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDump Processor../crawled-small > ../crawled-small/bla.txt 2>&147 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1 48 48 * 49 49 */ 50 public class NutchTextDump Processor{51 static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDump Processor.class.getName());50 public class NutchTextDumpToCSV { 51 static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName()); 52 52 53 53 static boolean DEBUG_MODE = true; 54 54 55 55 /** Counter for number of sites. 56 * Should be equal to number of times NutchTextDump Processorconstructor56 * Should be equal to number of times NutchTextDumpToCSV constructor 57 57 * is called: once per site. 
58 58 */ … … 66 66 public final boolean siteCrawlUnfinished; 67 67 public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 68 68 69 69 private String domainOfSite; 70 70 … … 100 100 } 101 101 102 /** A NutchTextDump Processorprocesses the dump.txt for one site */103 public NutchTextDump Processor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,102 /** A NutchTextDumpToCSV processes the dump.txt for one site */ 103 public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 104 104 MaoriTextDetector maoriTxtDetector, String siteID, 105 105 File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 106 106 throws IOException 107 107 { 108 // increment static counter of sites processed by a NutchTextDump Processorinstance108 // increment static counter of sites processed by a NutchTextDumpToCSV instance 109 109 SITE_COUNTER++; 110 110 … … 190 190 prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter); 191 191 } 192 192 193 /* UNUSED */ 193 194 /** pageID: id into pages array */ 195 /* 194 196 public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException { 195 197 … … 228 230 return pagesContainingMRI.size(); 229 231 } 232 */ 230 233 231 234 private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException { … … 257 260 // remaining elements are the actual sentences that were detected as being Māori 258 261 int totalSentences = Integer.parseInt(mriSentences.get(0)); 259 int numSentencesInMRI = mriSentences.size() - 1; 262 int numSentencesInMRI = mriSentences.size() - 1; 260 263 261 264 // Add page to list of MRI pages if the page's body text overall was detected … … 367 370 public static void printUsage() { 368 371 System.err.println("Run this program as:"); 369 System.err.println("\tNutchTextDump Processor<path to 'crawled' folder>");372 System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>"); 370 373 
} 371 374 … … 382 385 } 383 386 384 NutchTextDump Processor.DEBUG_MODE = false;387 NutchTextDumpToCSV.DEBUG_MODE = false; 385 388 386 389 File websitesCSVFile = new File(sitesDir, "websites.csv"); … … 426 429 long lastModified = siteDir.lastModified(); 427 430 logger.debug("Found siteID: " + siteID); 428 NutchTextDump Processor nutchTxtDump = new NutchTextDumpProcessor(431 NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV( 429 432 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, 430 433 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); … … 442 445 443 446 } catch(Exception e) { 444 // can get an exception when instantiating NutchTextDump Processorinstance447 // can get an exception when instantiating NutchTextDumpToCSV instance 445 448 // or with CSV file 446 449 logger.error(e.getMessage(), e); -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33587 r33633 142 142 } 143 143 144 /** inner class */ 145 public class SentenceInfo { 146 public final double confidenceLevel; 147 /** 3 letter lang code */ 148 public final String langCode; 149 public final String sentence; 150 151 public SentenceInfo(double confidence, String langCode, String sentence) { 152 confidenceLevel = confidence; 153 this.langCode = langCode; 154 this.sentence = sentence; 155 } 156 } 157 158 /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text 159 * into sentences? What if the text in any other language or a mix of languages? 160 * Doesn't this assume that all languages split sentences alike? */ 161 public String[] getAllSentences(String text) { 162 163 // This function doesn't work if the sentenceDetector object wasn't set up 164 if(sentenceDetector == null) return null; 165 166 String[] sentences = sentenceDetector.sentDetect(text); 167 return sentences; 168 } 169 170 public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) { 171 172 if(sentences == null) { 173 return null; 174 } 175 176 ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>(); 177 for(int i = 0; i < sentences.length; i++) { 178 String sentence = sentences[i]; 179 180 //System.err.println(sentence); 181 182 Language bestLanguage = myCategorizer.predictLanguage(sentence); 183 double confidence = bestLanguage.getConfidence(); 184 185 sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence)); 186 } 187 188 return sentencesList; 189 } 190 191 public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) { 192 193 if(sentences == null) { 194 return null; 195 } 196 197 ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>(); 198 for(int i = 1; i < sentences.length; i++) { 199 // glue every two adjacent sentences together 200 String sentence = sentences[i-1]; 201 202 String separator = ". 
"; 203 // if the sentence already ends with a terminating punctuation character, 204 // then separator is just a space 205 sentence = sentence.trim(); 206 if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) { 207 separator = " "; 208 } 209 sentence = sentence + separator + sentence[i]; 210 211 //System.err.println(sentence); 212 213 Language bestLanguage = myCategorizer.predictLanguage(sentence); 214 double confidence = bestLanguage.getConfidence(); 215 216 sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence)); 217 } 218 219 return sentencesList; 220 } 221 144 222 /** 145 223 * In this class' constructor, need to have set up the Sentence Detection Model … … 269 347 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence 270 348 271 272 349 Language bestLanguage = myCategorizer.predictLanguage(text.toString()); 273 350 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
Note:
See TracChangeset
for help on using the changeset viewer.