Changeset 33503 for gs3-extensions
- Timestamp:
- 2019-09-23T23:16:28+12:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33501 r33503 6 6 import java.util.zip.GZIPInputStream; 7 7 import java.util.Iterator; 8 import java.util.HashMap; 9 import java.util.Map; 8 10 import java.util.Set; 9 11 import java.util.TreeSet; … … 14 16 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through 15 17 * the WET records in each, putting each WET record into a file. Each file is put into a 16 * keep or discard folder, based on content-length and number of lines. 17 * A single instance of the WETProcessor class processes a single unzipped warc.wet file. 18 * keep or discard or greyListed folder, and its url is written into a keep, discard 19 * or greylisted text file, based on: 20 * 21 * 1. whether it's whitelisted, else greylisted else blacklisted 22 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's 23 * enough content. Formerly, content-length and number of lines were used to determine if 24 * the content was sufficient. Now it's just word count and number of MAX characters 25 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted 26 * in conf/config.properties. 27 * 28 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt 29 * into the conf folder to control any url patterns that are explicitly included or excluded or 30 * set aside for inspecting later. These filter text files don't use regexes, instead their 31 * format is: 32 * - precede URL by ^ to blacklist urls that match the given prefix 33 * - follow URL by $ to blacklist urls that match the given suffix 34 * - ^url$ will blacklist urls that match the given url completely 35 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted 36 * 37 * WETProcessor.java's current implementation is that explicit whitelisting has precedence 38 * over greylisting, which in turn takes precedence over blacklisting. 
However, even 39 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt 40 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt 41 * also for nutch. 42 * 43 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files 44 * in the given input folder. Then use a single instance of the WETProcessor class to process 45 * each single unzipped warc.wet file. 18 46 * 19 47 * To compile, including the jars in lib/ for compiling. … … 44 72 public final File discardFolder; 45 73 public final File keepFolder; 74 public final File greyListedFolder; 46 75 public final File keepURLsFile; 47 76 public final File discardURLsFile; 77 public final File greyListedFile; 78 79 private final Integer LIST_ENTRY_CONTAINS = new Integer(0); 80 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1); 81 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2); 82 private final Integer LIST_ENTRY_MATCHES = new Integer(3); 83 84 private HashMap<String, Integer> blackList; 85 private HashMap<String, Integer> greyList; 86 private HashMap<String, Integer> whiteList; 48 87 49 88 // Keep a count of all the records that all WETProcessors instantiated … … 53 92 private int wetFileCount = 0; 54 93 55 public CCWETProcessor(File inFolder, File outFolder) {94 public CCWETProcessor(File inFolder, File outFolder) throws Exception { 56 95 this.WETFilesDir = inFolder; 57 96 this.outputFolder = outFolder; 58 97 59 98 // load up the properties from the config file 60 try (InputStream infile = org.greenstone.atea. 
WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {99 try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { 61 100 configProperties = new Properties(); 62 101 configProperties.load(infile); 63 //infile.close(); 102 //infile.close(); // not explicitly called in examples of try-with-resources 64 103 65 104 } catch(Exception e) { … … 86 125 keepFolder.mkdir(); 87 126 } 88 127 128 this.greyListedFolder = new File(outFolder, "greylisted"); 129 if(!greyListedFolder.exists()) { 130 greyListedFolder.mkdir(); 131 } 132 89 133 this.keepURLsFile = new File(outFolder, "keepURLs.txt"); 90 134 if(keepURLsFile.exists() && !keepURLsFile.delete()) { 91 System.err.println("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed."); 92 //return; 135 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed."); 93 136 } 94 137 this.discardURLsFile = new File(outFolder, "discardURLs.txt"); 95 138 if(discardURLsFile.exists() && !discardURLsFile.delete()) { 96 System.err.println("Warning Unable to delete " + discardURLsFile + ". Unable to proceed."); 97 //return; 98 } 99 139 throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed."); 140 } 141 this.greyListedFile = new File(outFolder, "greyListed.txt"); 142 if(greyListedFile.exists() && !greyListedFile.delete()) { 143 throw new Exception ("Warning Unable to delete " + greyListedFile + ". 
Unable to proceed."); 144 } 145 146 System.err.println("Loading blacklist."); 147 blackList = new HashMap<String, Integer>(); 148 initURLFilterList(blackList, "url-blacklist-filter.txt"); 149 System.err.println("Loading greylist."); 150 greyList = new HashMap<String, Integer>(); 151 initURLFilterList(greyList, "url-greylist-filter.txt"); 152 System.err.println("Loading whitelist."); 153 whiteList = new HashMap<String, Integer>(); 154 initURLFilterList(whiteList, "url-whitelist-filter.txt"); 155 156 //System.err.println("Prematurely terminating for testing purposes."); 157 //System.exit(-1); 100 158 } 101 159 … … 171 229 } 172 230 173 /* 231 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) { 232 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet(); 233 Iterator<Map.Entry<String, Integer>> i = entries.iterator(); 234 while(i.hasNext()) { 235 Map.Entry<String, Integer> entry = i.next(); 236 String urlPattern = entry.getKey(); 237 Integer matchRule = entry.getValue(); 238 239 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) { 240 return true; 241 } 242 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) { 243 return true; 244 } 245 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) { 246 return true; 247 } 248 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) { 249 return true; 250 } 251 // else check the rest of the filter list against this url 252 // before returning false to be certain it's not been listed in the filter list 253 } 254 255 return false; 256 } 257 258 /** 259 * Returns true if the url or pattern is found in the blacklist file. 260 * Note that if eventually the same url pattern is found in the greylist or whitelist too, 261 * it won't get blacklisted after all. But that's not implemented here. 
262 */ 174 263 public boolean isBlacklisted(String url) { 175 return false; 176 } 177 */ 178 264 return isListedInFilterList(blackList, url); 265 } 266 267 /** 268 * Returns true if the url or pattern is explicitly mentioned in the greylist file. 269 * Will eventually take precedence over the blacklist if the same URL pattern is also mentioned there. 270 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist. 271 */ 179 272 public boolean isGreylisted(String url) { 180 // alexa top sites and auto-translated product sites 181 return false; 273 // TODO: alexa top sites and auto-translated product sites 274 return isListedInFilterList(greyList, url); 275 } 276 277 /** 278 * Returns true if the url or pattern is explicitly mentioned in the whitelist file. 279 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist. 280 */ 281 public boolean isWhitelisted(String url) { 282 return isListedInFilterList(whiteList, url); 182 283 } 183 284 … … 189 290 * we don't have to remember to escape or double escape each filter to turn it into a regex. 
190 291 */ 191 //public boolean isInDiscardFilter(String url) { 192 193 public boolean isBlacklisted(String url) { 292 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) { 293 294 // if filterListFilename does not exist in the conf folder, just return 295 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) { 296 System.err.println(filterListFilename + " does not exist"); 297 return; 298 } 299 300 try ( 301 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8")); 302 ) { 303 String filter = null; 304 while((filter = reader.readLine()) != null) { 305 // skip comments and empty lines 306 filter = filter.trim(); 307 if(filter.equals("") || filter.startsWith("#")) { 308 continue; 309 } 310 311 if(filter.startsWith("^") && filter.endsWith("$")) { 312 filter = filter.substring(1, filter.length()-1); 313 list.put(filter, LIST_ENTRY_MATCHES); 314 } 315 else if(filter.startsWith("^")) { 316 filter = filter.substring(1); 317 list.put(filter, LIST_ENTRY_STARTSWITH); 318 System.err.println("Match filter startswith: " + filter); 319 } 320 else if(filter.endsWith("$")) { 321 filter = filter.substring(0, filter.length()-1); 322 list.put(filter, LIST_ENTRY_ENDSWITH); 323 } 324 else { 325 list.put(filter, LIST_ENTRY_CONTAINS); 326 } 327 //System.err.println("Got filter: " + filter); 328 } 329 330 } catch (IOException ioe) { 331 ioe.printStackTrace(); 332 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename); 333 } 334 335 } 336 /* 337 public boolean isInDiscardFilter(String url) { 194 338 String discardFilterFile = "url-discard-filter.txt"; // in conf folder 195 339 196 340 try ( 197 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea. 
WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));341 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8")); 198 342 ) { 199 343 String filter = null; … … 227 371 228 372 return false; 229 } 373 }*/ 230 374 231 375 /** Maintain a count of all WET files processed. */ … … 293 437 } 294 438 439 try { 295 440 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder); 296 441 … … 341 486 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 342 487 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile); 488 } catch(Exception e) { 489 // can get an exception when instantiating CCWETProcessor instance 490 e.printStackTrace(); 491 System.err.println(e.getMessage()); 492 } 343 493 344 494 return; -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33501 r33503 12 12 13 13 /** 14 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through15 * the WET records in each, putting each WET record into a file. Each file is put into a16 * keep or discard folder, based on content-length and number of lines.17 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.14 * A single instance of the WETProcessor class can process a single unzipped warc.wet file. 15 * A WETProcessor takes a warc.wet file and goes through all its WET records, 16 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder 17 * and its url is written into a keep, discard or greylisted text file, based on: 18 18 * 19 * To compile, including the jars in lib/ for compiling. 20 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java 19 * 1. whether it's whitelisted, else greylisted else blacklisted 20 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's 21 * enough content. Formerly, content-length and number of lines were used to determine if 22 * the content was sufficient. Now it's just word count and number of MAX characters 23 * (not MINIMUM characters) that determine a string is a word. 24 * Explicit whitelisting has precedence over greylisting, which in turn takes precedence 25 * over blacklisting. 26 * However, even explicitly whitelisted urls still need to have sufficient content to end 27 * up in keepURLs.txt. 21 28 * 22 * To run, passing the log4j and other properties files in conf/ folder: 23 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder> 24 * 25 * e.g. 
26 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 27 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 29 * See CCWETProcessor.java for compile instructions and how to run. 28 30 * 29 31 */ … … 72 74 public int processWETFile() { 73 75 File keepURLsFile = this.batchProcessor.keepURLsFile; 74 File discardURLsFile = this.batchProcessor.discardURLsFile; 76 File discardURLsFile = this.batchProcessor.discardURLsFile; 77 File greyListedFile = this.batchProcessor.greyListedFile; 75 78 76 79 StringBuilder record = null; … … 90 93 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true)); 91 94 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append 95 BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append 92 96 ) { 93 97 … … 104 108 // process any previous record 105 109 if(record != null) { 106 processWETrecord(keepURLsWriter, discardURLsWriter, 110 processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter, 107 111 recordCount, contentLength, lineCount, 108 112 WARCtargetURI, record.toString()); … … 147 151 // flush the last record. 
If it was a warcinfo record, record would be null here 148 152 if(record != null) { 149 processWETrecord(keepURLsWriter, discardURLsWriter, 153 processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter, 150 154 recordCount, contentLength, lineCount, 151 155 WARCtargetURI, record.toString()); … … 169 173 */ 170 174 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter, 175 BufferedWriter greyListedURLsWriter, 171 176 int recordID, int contentLength, int lineCount, 172 177 String recordURI, String record) … … 210 215 } 211 216 */ 212 217 213 218 if(batchProcessor.isBlacklisted(recordURI)) { 214 parentFolder = batchProcessor.discardFolder; 219 220 221 // explicit whitelisting overrides blacklisting 222 if(batchProcessor.isWhitelisted(recordURI)) { 223 parentFolder = batchProcessor.keepFolder; //tentative 224 } 225 // if not whitelisted, then greylisting overrides blacklisting 226 else if(batchProcessor.isGreylisted(recordURI)) { 227 parentFolder = batchProcessor.greyListedFolder; 228 System.err.println("@@@GREYLISTED"); 229 } 230 else { // only blacklisted 231 parentFolder = batchProcessor.discardFolder; 232 System.err.println("@@@DISCARDING - blacklisted"); 233 } 215 234 } 216 235 else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites 217 parentFolder = batchProcessor.discardFolder; // TODO: checkfolder 218 } else { 236 // explicit whitelisting overrides greylisting 237 if(batchProcessor.isWhitelisted(recordURI)) { 238 parentFolder = batchProcessor.keepFolder; // tentative 239 } 240 else { 241 parentFolder = batchProcessor.greyListedFolder; 242 System.err.println("@@@GREYLISTED"); 243 } 244 } 245 246 // If URL was not blacklisted/greylisted, or was even explicitly whitelisted, 247 // it still can't be in the keep list as it needs further inspection: 248 // it needs sufficient content for language analysis. 
249 if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null 250 219 251 // If a web page's WET record contains a certain minimum number of words, 220 252 // we will think it's a meaningful web page and has sufficient content for text analysis … … 254 286 } 255 287 } 256 // if parentFolder still not set, set to discard pile folder 288 // if parentFolder still not set, it means that the content length/num words or lines 289 // were insufficient, so meant to be discarded 257 290 if(parentFolder == null) { 258 291 parentFolder = batchProcessor.discardFolder; … … 263 296 if (parentFolder == batchProcessor.keepFolder) { 264 297 keepURLsWriter.write(recordURI + "\n"); 298 } else if (parentFolder == batchProcessor.greyListedFolder) { 299 greyListedURLsWriter.write(recordURI + "\n"); 265 300 } else { 266 301 discardURLsWriter.write(recordURI + "\n");
Note:
See TracChangeset
for help on using the changeset viewer.