- Timestamp:
- 2019-09-23T23:16:28+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33501 r33503 12 12 13 13 /** 14 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through 15 * the WET records in each, putting each WET record into a file. Each file is put into a 16 * keep or discard folder, based on content-length and number of lines. 17 * A single instance of the WETProcessor class processes a single unzipped warc.wet file. 14 * A single instance of the WETProcessor class can process a single unzipped warc.wet file. 15 * A WETProcessor takes a warc.wet file and goes through all its WET records, 16 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder 17 * and its url written into a keep, discard or greylisted text file, based on: 18 18 * 19 * To compile, including the jars in lib/ for compiling. 20 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java 19 * 1. whether it's whitelisted, else greylisted else blacklisted 20 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's 21 * enough content. Formerly, content-length and number of lines were used to determine if 22 * the content was sufficient. Now it's just word count and number of MAX characters 23 * (not MINIMUM characters) that determine whether a string is a word. 24 * Explicit whitelisting has precedence over greylisting, which takes precedence 25 * over blacklisting in turn. 26 * However, even explicitly whitelisted urls still need to have sufficient content to end 27 * up in keepURLs.txt. 21 28 * 22 * To run, passing the log4j and other properties files in conf/ folder: 23 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder> 24 * 25 * e.g. 
26 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 27 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 29 * See CCWETProcessor.java for compile instructions and how to run. 28 30 * 29 31 */ … … 72 74 public int processWETFile() { 73 75 File keepURLsFile = this.batchProcessor.keepURLsFile; 74 File discardURLsFile = this.batchProcessor.discardURLsFile; 76 File discardURLsFile = this.batchProcessor.discardURLsFile; 77 File greyListedFile = this.batchProcessor.greyListedFile; 75 78 76 79 StringBuilder record = null; … … 90 93 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true)); 91 94 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append 95 BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append 92 96 ) { 93 97 … … 104 108 // process any previous record 105 109 if(record != null) { 106 processWETrecord(keepURLsWriter, discardURLsWriter, 110 processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter, 107 111 recordCount, contentLength, lineCount, 108 112 WARCtargetURI, record.toString()); … … 147 151 // flush the last record. 
If it was a warcinfo record, record would be null here 148 152 if(record != null) { 149 processWETrecord(keepURLsWriter, discardURLsWriter, 153 processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter, 150 154 recordCount, contentLength, lineCount, 151 155 WARCtargetURI, record.toString()); … … 169 173 */ 170 174 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter, 175 BufferedWriter greyListedURLsWriter, 171 176 int recordID, int contentLength, int lineCount, 172 177 String recordURI, String record) … … 210 215 } 211 216 */ 212 217 213 218 if(batchProcessor.isBlacklisted(recordURI)) { 214 parentFolder = batchProcessor.discardFolder; 219 220 221 // explicit whitelisting overrides blacklisting 222 if(batchProcessor.isWhitelisted(recordURI)) { 223 parentFolder = batchProcessor.keepFolder; //tentative 224 } 225 // if not whitelisted, then greylisting overrides blacklisting 226 else if(batchProcessor.isGreylisted(recordURI)) { 227 parentFolder = batchProcessor.greyListedFolder; 228 System.err.println("@@@GREYLISTED"); 229 } 230 else { // only blacklisted 231 parentFolder = batchProcessor.discardFolder; 232 System.err.println("@@@DISCARDING - blacklisted"); 233 } 215 234 } 216 235 else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites 217 parentFolder = batchProcessor.discardFolder; // TODO: checkfolder 218 } else { 236 // explicit whitelisting overrides greylisting 237 if(batchProcessor.isWhitelisted(recordURI)) { 238 parentFolder = batchProcessor.keepFolder; // tentative 239 } 240 else { 241 parentFolder = batchProcessor.greyListedFolder; 242 System.err.println("@@@GREYLISTED"); 243 } 244 } 245 246 // If URL was not blacklisted/greylisted, or was even explicitly whitelisted, 247 // it still can't be in the keep list as it needs further inspection: 248 // it needs sufficient content for language analysis. 
249 if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null 250 219 251 // If a web page's WET record contains a certain minimum number of words, 220 252 // we will think it's a meaningful web page and has sufficient content for text analysis … … 254 286 } 255 287 } 256 // if parentFolder still not set, set to discard pile folder 288 // if parentFolder still not set, it means that the content length/num words or lines 289 // were insufficient, so meant to be discarded 257 290 if(parentFolder == null) { 258 291 parentFolder = batchProcessor.discardFolder; … … 263 296 if (parentFolder == batchProcessor.keepFolder) { 264 297 keepURLsWriter.write(recordURI + "\n"); 298 } else if (parentFolder == batchProcessor.greyListedFolder) { 299 greyListedURLsWriter.write(recordURI + "\n"); 265 300 } else { 266 301 discardURLsWriter.write(recordURI + "\n");
Note: See TracChangeset for help on using the changeset viewer.