Changeset 33468
- Timestamp: 2019-09-13T19:24:27+12:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33467 r33468 21 21 * 22 22 * e.g. 23 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 24 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 25 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 23 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 24 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 26 25 * 27 26 */ … … 55 54 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 56 55 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 56 57 // File paths shared across WETProcessor instances 58 private static File discardFolder; 59 private static File keepFolder; 60 private static File keepURLsFile; 61 private static File discardURLsFile; 62 63 // WARC WET header lines and header line prefixes of interest 64 static final String WARC_RECORD_START = "WARC/1.0"; 65 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo"; 66 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:"; 67 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:"; 57 68 58 69 // Keep a count of all the records that all WETProcessors instantiated … … 76 87 String line = null; 77 88 boolean readingRecord = false; 89 90 String WARCtargetURI = ""; 78 91 79 92 int recordCount = 0; … … 92 105 fileID = fileID.substring(0, fileID.indexOf(".")); 93 106 
this.WETFileID = fileID; 94 107 108 95 109 // read from WETfile 96 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 110 try ( 111 BufferedReader reader = new BufferedReader(new FileReader(inFile)); 112 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true)); 113 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append 114 ) { 97 115 98 116 while((line = reader.readLine()) != null) { // readLine removes newline separator 99 117 100 if(line.startsWith( "WARC-Type: warcinfo")) {118 if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo" 101 119 readingRecord = false; 102 120 record = null; // drop this record, which is just an info record not actual web page's text … … 105 123 } 106 124 107 if(line.startsWith( "WARC/1.0")) { //finished previous WET record125 if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record 108 126 // process any previous record 109 127 if(record != null) { 110 processWETrecord(recordCount, contentLength, lineCount, record.toString()); 128 processWETrecord(keepURLsWriter, discardURLsWriter, 129 recordCount, contentLength, lineCount, 130 WARCtargetURI, record.toString()); 111 131 record = null; 112 132 contentLength = -1; … … 122 142 if(readingRecord) { // append current line to current record 123 143 144 if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:" 145 // get and store the value 146 WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim(); 147 } 148 124 149 record.append(line + "\n"); // add back (unix style) line ending 125 150 … … 132 157 lineCount++; 133 158 } 134 else if(line.startsWith( "Content-Length:")) {135 String contentLengthStr = line.substring( "Content-Length:".length()).trim();159 else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:" 160 String contentLengthStr = 
line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim(); 136 161 contentLength = Integer.parseInt(contentLengthStr); 137 162 lineCount = 0; … … 144 169 // flush the last record. If it was a warcinfo record, record would be null here 145 170 if(record != null) { 146 processWETrecord(recordCount, contentLength, lineCount, record.toString()); 171 processWETrecord(keepURLsWriter, discardURLsWriter, 172 recordCount, contentLength, lineCount, 173 WARCtargetURI, record.toString()); 147 174 record = null; 148 175 } … … 156 183 * Determines if a WET record belongs in the keep or discard pile depending on if it 157 184 * contains enough text, based on contentLength and line count of the record body. 158 * Then writes out the WET record to a uniquely named file in the keep or discard folder. 185 * Then writes out the WET record to a uniquely named file in the keep or discard folder, 186 * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file. 159 187 */ 160 private void processWETrecord(int recordID, int contentLength, int lineCount, String record) 188 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter, 189 int recordID, int contentLength, int lineCount, 190 String recordURI, String record) 161 191 { 162 192 System.err.println("WET #" + this.WETFileID + " record #" + recordID 163 193 + " - contentLength: " + contentLength 164 194 + " - lineCount: " + lineCount); 195 System.err.println("URI: " + recordURI); 165 196 //System.err.println(record); 166 197 //System.err.println("--------------------------"); … … 168 199 String paddedFileName = String.format("%04d.txt", recordID); 169 200 170 File discardFolder = new File(this.outputFolder, "discard");171 File keepFolder = new File(this.outputFolder, "keep");172 201 File parentFolder = null; 173 202 174 203 if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 175 parentFolder = keepFolder;204 parentFolder = 
WETProcessor.keepFolder; 176 205 System.err.println("@@@KEEPING"); 177 206 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) { … … 183 212 // So we have at least 500 chars (possibly on a single wrapped line) 184 213 // containing at least 10 spaces. Such a record is also worth keeping. 185 parentFolder = keepFolder;214 parentFolder = WETProcessor.keepFolder; 186 215 } 187 216 } … … 189 218 // if parentFolder still not set, set to discard pile folder 190 219 if(parentFolder == null) { 191 parentFolder = discardFolder;220 parentFolder = WETProcessor.discardFolder; 192 221 System.err.println("@@@DISCARDING"); 193 222 } 194 223 224 try { 225 if (parentFolder == WETProcessor.keepFolder) { 226 keepURLsWriter.write(recordURI + "\n"); 227 } else { 228 discardURLsWriter.write(recordURI + "\n"); 229 } 230 } catch(Exception e) { 231 System.err.println("Unable to write URL"); 232 e.printStackTrace(); 233 } 234 195 235 System.err.println("--------------------------"); 196 236 … … 265 305 return; 266 306 } 267 268 File discardFolder = new File(outFolder, "discard"); 269 if(!discardFolder.exists()) { 270 discardFolder.mkdir(); 307 308 // static folders and files to be shared across all WETProcessor instances 309 WETProcessor.discardFolder = new File(outFolder, "discard"); 310 if(!WETProcessor.discardFolder.exists()) { 311 WETProcessor.discardFolder.mkdir(); 271 312 } 272 File keepFolder = new File(outFolder, "keep"); 273 if(!keepFolder.exists()) { 274 keepFolder.mkdir(); 275 } 276 313 WETProcessor.keepFolder = new File(outFolder, "keep"); 314 if(!WETProcessor.keepFolder.exists()) { 315 WETProcessor.keepFolder.mkdir(); 316 } 317 318 WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt"); 319 if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) { 320 System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". 
Unable to proceed."); 321 return; 322 } 323 WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt"); 324 if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) { 325 System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed."); 326 return; 327 } 328 277 329 // Will list all the warc.wet files in the input directory or else their gzipped versions 278 330 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
Note: See TracChangeset for help on using the changeset viewer.