Changeset 33587 for gs3-extensions
- Timestamp:
- 2019-10-18T23:16:25+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java
r33582 r33587 11 11 public final String URL; // URL of webpage 12 12 public final int pageID; // index into NutchTextDumpProcessor::pages ArrayList 13 14 public final boolean isMRI; 15 public final int numSentences; // count of all sentences in the webpage's body 16 public final int numSentencesInMRI; // count of sentences in the webpage's body in Māori (mri) 17 13 18 14 public MRIWebPageStats(String siteID, String url, int pageID) { 19 public MRIWebPageStats(String siteID, String url, int pageID, boolean isMRI, 20 int numSentences, int numSentencesInMRI) 21 { 15 22 this.siteID = siteID; 16 23 this.URL = url; 17 24 this.pageID = pageID; 25 26 this.isMRI = isMRI; 27 this.numSentences = numSentences; 28 this.numSentencesInMRI = numSentencesInMRI; 29 } 30 31 public String toString() { 32 StringBuilder str = new StringBuilder(); 33 str.append("URL: " + this.URL); 34 str.append("\nsiteID: " + this.siteID); 35 str.append("\nnum sentences in MRI: " + this.numSentencesInMRI+"/"+this.numSentences); 36 if(this.isMRI && this.numSentencesInMRI <= 0) { 37 // one or more pages in the site were MRI, but they didn't contain proper sentences 38 str.append(" (no PROPER sentences in MRI)"); 39 } 40 return str.toString(); 18 41 } 19 42 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java
r33586 r33587 69 69 * @param text: the string of text from which sentences in the requested 70 70 * language are to be identified and returned. 71 * @return an ArrayList of sentences in the text parameter that are 72 * in the requested language. 73 */ 74 public ArrayList<String> getAllSentencesInMaori(String text) throws Exception { 71 * @return an ArrayList where: 72 * - the first element is the total number of sentences in the text parameter 73 * - remaining elements are the sentences in the text parameter that were in the 74 * requested language. 75 */ 76 public ArrayList<String> getAllSentencesInMaori(String text) { 75 77 // big assumption here: that we can split incoming text into sentences 76 78 // for any language (using the Māori language trained sentence model), -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33582 r33587 2 2 3 3 import java.io.*; 4 import java.lang.ArrayIndexOutOfBoundsException; 4 5 import java.util.ArrayList; 5 //import java.util.HashMap; 6 //import java.util.Map; 7 import java.lang.ArrayIndexOutOfBoundsException; 6 import java.util.Arrays; 8 7 9 8 import org.apache.log4j.Logger; 10 9 11 10 /** 12 * Class to process the dump text files produced for each site(e.g. site "00001") that11 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that 13 12 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt. 14 13 * This reads in the dump.txt file contained in each site folder within the input folder. … … 59 58 private ArrayList<TextDumpPage> pages; 60 59 60 /** list of pages in this site which were detected as being in MRI */ 61 61 private ArrayList<MRIWebPageStats> pagesInMRI; 62 62 /** list of pages in this site which were NOT detected as being in MRI but nevertheless 63 * contain one or more sentences in MRI 64 */ 65 private ArrayList<MRIWebPageStats> pagesContainingMRI; 66 63 67 private boolean isStartOfNewWebPageRecord(String prevLine, String line) { 64 68 // The start of a new web page's record in nutch's text dump of an entire site … … 147 151 } 148 152 149 // Just do this once: get domain of site.153 // Just do this once: get and store domain of site. 
150 154 // Passing true to get domain with protocol prefix 151 155 if(pages.size() > 0) { 152 TextDumpPage firstPage = pages.get(0); 153 String url = firstPage.getPageURL(); 154 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 156 TextDumpPage firstPage = pages.get(0); 157 String url = firstPage.getPageURL(); 158 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 155 159 } 156 160 else { … … 195 199 private void prepareSiteStats() { 196 200 pagesInMRI = new ArrayList<MRIWebPageStats>(); 197 201 pagesContainingMRI = new ArrayList<MRIWebPageStats>(); 198 202 199 203 TextDumpPage page = null; 200 204 for(int i = 0; i < pages.size(); i++) { 201 202 205 203 206 page = pages.get(i); 204 205 /*206 // just do this once: get domain. Passing true to get domain with protocol prefix207 if(this.domainOfSite == null) {208 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);209 }210 */211 207 212 208 String text = page.getPageText(); … … 217 213 else { 218 214 boolean isMRI = maoriTxtDetector.isTextInMaori(text); 215 219 216 page.addMRILanguageStatus(isMRI); 220 217 221 if(isMRI) { // add page to list of meaningful pages. 218 // Even if the entire page is not found to be overall in Māori, 219 // let's sitll inspect the sentences of the page and count how many (if any) 220 // are in te reo. 
221 ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text); 222 // first element is always total num sentences 223 // remaining elements are the actual sentences that were detected as being Māori 224 int totalSentences = Integer.parseInt(mriSentences.get(0)); 225 int numSentencesInMRI = mriSentences.size() - 1; 226 227 228 // Add page to list of MRI pages if the page's body text overall was detected 229 // as Māori 230 // Add page to list of pages containing MRI if >= 1 sentences in the page 231 // were detected as being in MRI 232 if(isMRI || numSentencesInMRI >= 1) { 222 233 String url = page.getPageURL(); 223 MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i); 224 pagesInMRI.add(MRIpageStats); 225 } 226 234 MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI, 235 totalSentences, numSentencesInMRI); 236 if(isMRI) { 237 pagesInMRI.add(MRIpageStats); 238 } else if(numSentencesInMRI >= 1) { 239 pagesContainingMRI.add(MRIpageStats); 240 } 241 242 } 227 243 } 228 244 } … … 237 253 info("Total number of web pages in site: " + pages.size()); 238 254 info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 239 240 info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 241 for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 242 info("URL: " + mriWebPageInfo.URL); 243 info("siteID: " + mriWebPageInfo.siteID); 255 256 if(pagesInMRI.size() > 0) { 257 info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 258 for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 259 info(mriWebPageInfo.toString()); 260 } 261 } 262 263 info(" ----------- "); 264 if(pagesContainingMRI.size() > 0) { 265 info("The following pages weren't detected as primarily being in Māori"); 266 info("But still contained sentences detected as Māori"); 267 
for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) { 268 info(mriWebPageInfo.toString()); 269 } 270 271 } else { 272 info("No further pages detected as containing any sentences in MRI"); 244 273 } 245 274 info(" ----------- "); … … 292 321 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 293 322 File[] sites = sitesDir.listFiles(); 323 324 // sort site folders in alphabetical order 325 // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order 326 Arrays.sort(sites); 327 294 328 for(File siteDir : sites) { // e.g. 00001 295 329 if(siteDir.isDirectory()) { -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33586 r33587 154 154 * for that sentence. The confidence cutoff provides an additional check. 155 155 * @return null if no Sentence Detection Model set up in constructor 156 * else returns an ArrayList of sentences in the text parameter that are 157 * in the requested language. 156 * else returns an ArrayList where: 157 * - the first element is the total number of sentences in the text parameter 158 * - remaining elements are the sentences in the text parameter that were in the 159 * requested language. 158 160 */ 159 161 public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff) … … 180 182 181 183 String[] sentences = sentenceDetector.sentDetect(text); 184 if(sentences == null) { 185 sentencesInLang.add("0"); // to indicate 0 sentences in requested language 186 return sentencesInLang; 187 } 188 189 // add in first element: how many sentences there were in text. 190 sentencesInLang.add(Integer.toString(sentences.length)); 182 191 183 192 for(int i = 0; i < sentences.length; i++) {
Note:
See TracChangeset
for help on using the changeset viewer.