Changeset 33582 for gs3-extensions
- Timestamp:
- 2019-10-17T23:12:38+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33575 r33582 240 240 * This retains any www. or subdomain prefix. 241 241 */ 242 private String getDomainForURL(String url, boolean withProtocol) { 242 public static String getDomainForURL(String url, boolean withProtocol) { 243 243 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 244 244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33580 r33582 42 42 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled 43 43 * 44 * or: 45 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1 46 * 44 47 */ 45 48 public class NutchTextDumpProcessor { 46 49 private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 47 50 51 static boolean DEBUG_MODE = true; 52 48 53 private final MaoriTextDetector maoriTxtDetector; 49 54 50 55 public final String siteID; // is this necessary? 51 56 private String domainOfSite; 57 52 58 /** keep a list to store the text of each page */ 53 59 private ArrayList<TextDumpPage> pages; 60 61 private ArrayList<MRIWebPageStats> pagesInMRI; 54 62 55 63 private boolean isStartOfNewWebPageRecord(String prevLine, String line) { … … 62 70 63 71 public void debugPageDump(StringBuilder pageDump) { 64 65 // START DEBUG 66 debug("__________________________________________"); 67 debug("@@@ Found page entry: "); 68 debug("__________________________________________"); 69 debug(pageDump.toString()); 70 debug("------------------------------------------"); 71 // END DEBUG 72 if(DEBUG_MODE) { 73 // START DEBUG 74 debug("__________________________________________"); 75 debug("@@@ Found page entry: "); 76 debug("__________________________________________"); 77 debug(pageDump.toString()); 78 debug("------------------------------------------"); 79 // END DEBUG 80 } 72 81 } 73 82 … … 137 146 error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe); 138 147 } 139 148 149 // Just do this once: get domain of site. 
150 // Passing true to get domain with protocol prefix 151 if(pages.size() > 0) { 152 TextDumpPage firstPage = pages.get(0); 153 String url = firstPage.getPageURL(); 154 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 155 } 156 else { 157 this.domainOfSite = "UNKNOWN"; 158 } 159 160 prepareSiteStats(); 140 161 } 141 162 … … 144 165 145 166 String text = getTextForPage(pageID); 167 146 168 // QTODO: what to do when page body text is empty? 147 169 if(text.equals("")) return false; … … 165 187 TextDumpPage page = getPage(pageID); 166 188 return page.getPageURL(); 167 } 168 189 } 190 191 public int totalNumPages() { 192 return pages.size(); 193 } 194 195 private void prepareSiteStats() { 196 pagesInMRI = new ArrayList<MRIWebPageStats>(); 197 198 199 TextDumpPage page = null; 200 for(int i = 0; i < pages.size(); i++) { 201 202 203 page = pages.get(i); 204 205 /* 206 // just do this once: get domain. Passing true to get domain with protocol prefix 207 if(this.domainOfSite == null) { 208 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 209 } 210 */ 211 212 String text = page.getPageText(); 213 if(text.equals("")) { 214 page.addMRILanguageStatus(false); 215 continue; 216 } 217 else { 218 boolean isMRI = maoriTxtDetector.isTextInMaori(text); 219 page.addMRILanguageStatus(isMRI); 220 221 if(isMRI) { // add page to list of meaningful pages. 
222 String url = page.getPageURL(); 223 MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i); 224 pagesInMRI.add(MRIpageStats); 225 } 226 227 } 228 } 229 } 230 231 public void printSiteStats() { 232 233 234 info("------------- " + this.siteID + " SITE STATS -----------"); 235 236 info("SITE DOMAIN: " + this.domainOfSite); 237 info("Total number of web pages in site: " + pages.size()); 238 info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 239 240 info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 241 for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 242 info("URL: " + mriWebPageInfo.URL); 243 info("siteID: " + mriWebPageInfo.siteID); 244 } 245 info(" ----------- "); 246 } 247 169 248 170 249 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // … … 208 287 } 209 288 289 NutchTextDumpProcessor.DEBUG_MODE = false; 290 210 291 try { 211 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent 292 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 212 293 File[] sites = sitesDir.listFiles(); 213 294 for(File siteDir : sites) { // e.g. 00001 … … 223 304 String siteID = siteDir.getName(); 224 305 debug("Found siteID: " + siteID); 225 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 306 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 307 // now it's parsed all the web pages in the site's text dump 308 309 // Let's print stats on each web page's detected language being MRI or not 310 // and how many pages there were in the site in total. 311 312 nutchTxtDump.printSiteStats(); 226 313 } 227 314 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33580 r33582 77 77 } 78 78 else { // dealing with the rest of the page dump's metadata 79 // QTODO: nutch's text dump output is problematic 80 // strange characters are in the stream and end up here 81 // and can't detect end of metadata or even end of line. 79 82 endIndex = line.indexOf(":"); 80 83 if(endIndex != -1) { … … 83 86 tuples.put(k.trim(), v.trim()); 84 87 } else { 85 error("No meta key for meta: " + line); 88 if(NutchTextDumpProcessor.DEBUG_MODE) { 89 error("No meta key for meta: " + line); 90 error(unparsedPageDump); 91 } 86 92 } 87 93 } … … 102 108 103 109 // START DEBUG 104 debug("__________________________________________"); 105 for(Map.Entry<String, String> entry : tuples.entrySet()) { 106 String key = entry.getKey(); 107 String value = entry.getValue(); 108 debug(key + " - " + value); 109 } 110 debug("__________________________________________"); 110 debugTuples(); 111 111 // END DEBUG 112 112 113 113 } 114 114 115 public void debugTuples() { 116 if(NutchTextDumpProcessor.DEBUG_MODE) { 117 debug("__________________________________________"); 118 for(Map.Entry<String, String> entry : tuples.entrySet()) { 119 String key = entry.getKey(); 120 String value = entry.getValue(); 121 debug(key + " - " + value); 122 } 123 debug("__________________________________________"); 124 } 125 } 126 115 127 116 128 public String getPageURL() { 117 return tuples.get("url"); 129 return tuples.get("pageURL"); 118 130 } 119 131 … … 126 138 } 127 139 140 public void add(String key, String value) { 141 tuples.put(key, value); 142 } 143 144 public void addMRILanguageStatus(boolean status) { 145 if(status) { 146 tuples.put("isMRI", "true"); 147 } else { 148 tuples.put("isMRI", "false"); 149 } 150 } 151 152 public boolean getMRILanguageStatus() { 153 String value = tuples.get("isMRI"); 154 if(value == null) { 155 return false; 156 } 157 if(value.equals("true")) { 158 return true; 159 } 160 else { 161 return false; 162 } 163 164 } 165 128 166 // --------------- STATIC 
METHODS AND INNER CLASSED USED BY MAIN -------------- // 129 167 public static void info(String msg) {
Note:
See TracChangeset
for help on using the changeset viewer.