Changeset 33468

Show
Ignore:
Timestamp:
13.09.2019 19:24:27 (5 weeks ago)
Author:
ak19
Message:

It is more meaningful to (also) write out the keep vs discard URLs into the keepURLs.txt and discardURLs.txt files than to write out only the WET records. That way, we have URLs to start downloading sites from (after removing duplicates and URLs of the same domain).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33467 r33468  
    2121 * 
    2222 * e.g. 
    23  *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
    24  *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
    25  *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 
     23 *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
     24 *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 
    2625 * 
    2726*/ 
     
    5554    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 
    5655    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 
     56 
     57    // File paths shared across WETProcessor instances 
     58    private static File discardFolder; 
     59    private static File keepFolder; 
     60    private static File keepURLsFile; 
     61    private static File discardURLsFile; 
     62 
     63    // WARC WET header lines and header line prefixes of interest 
     64    static final String WARC_RECORD_START = "WARC/1.0"; 
     65    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo"; 
     66    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:"; 
     67    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:"; 
    5768     
    5869    // Keep a count of all the records that all WETProcessors instantiated 
     
    7687    String line = null;  
    7788    boolean readingRecord = false; 
     89 
     90    String WARCtargetURI = ""; 
    7891     
    7992    int recordCount = 0; 
     
    92105    fileID = fileID.substring(0, fileID.indexOf(".")); 
    93106    this.WETFileID = fileID; 
    94      
     107 
     108         
    95109    // read from WETfile 
    96     try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 
     110    try ( 
     111         BufferedReader reader = new BufferedReader(new FileReader(inFile));          
     112         BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true)); 
     113         BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append 
     114         ) { 
    97115         
    98116        while((line = reader.readLine()) != null) { // readLine removes newline separator 
    99117 
    100         if(line.startsWith("WARC-Type: warcinfo")) { 
     118        if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo" 
    101119            readingRecord = false; 
    102120            record = null; // drop this record, which is just an info record not actual web page's text 
     
    105123        } 
    106124         
    107         if(line.startsWith("WARC/1.0")) { // finished previous WET record 
     125        if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record 
    108126            // process any previous record 
    109127            if(record != null) { 
    110             processWETrecord(recordCount, contentLength, lineCount, record.toString()); 
     128            processWETrecord(keepURLsWriter, discardURLsWriter, 
     129                     recordCount, contentLength, lineCount, 
     130                     WARCtargetURI, record.toString()); 
    111131            record = null; 
    112132            contentLength = -1; 
     
    122142        if(readingRecord) { // append current line to current record 
    123143 
     144            if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:" 
     145            // get and store the value 
     146            WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim(); 
     147            } 
     148             
    124149            record.append(line + "\n"); // add back (unix style) line ending 
    125150 
     
    132157            lineCount++; 
    133158            } 
    134             else if(line.startsWith("Content-Length:")) { 
    135             String contentLengthStr = line.substring("Content-Length:".length()).trim(); 
     159            else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:" 
     160            String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim(); 
    136161            contentLength = Integer.parseInt(contentLengthStr); 
    137162            lineCount = 0; 
     
    144169        // flush the last record. If it was a warcinfo record, record would be null here 
    145170        if(record != null) { 
    146         processWETrecord(recordCount, contentLength, lineCount, record.toString()); 
     171        processWETrecord(keepURLsWriter, discardURLsWriter, 
     172                 recordCount, contentLength, lineCount, 
     173                 WARCtargetURI, record.toString()); 
    147174        record = null; 
    148175        } 
     
    156183     * Determines if a WET record belongs in the keep or discard pile depending on if it 
    157184     * contains enough text, based on contentLength and line count of the record body. 
    158      * Then writes out the WET record to a uniquely named file in the keep or discard folder. 
     185     * Then writes out the WET record to a uniquely named file in the keep or discard folder, 
     186     * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file. 
    159187     */ 
    160     private void processWETrecord(int recordID, int contentLength, int lineCount, String record) 
     188    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter, 
     189                  int recordID, int contentLength, int lineCount, 
     190                  String recordURI, String record) 
    161191    { 
    162192    System.err.println("WET #" + this.WETFileID + " record #" + recordID 
    163193               + " - contentLength: " + contentLength 
    164194               + " - lineCount: " + lineCount); 
     195    System.err.println("URI: " + recordURI); 
    165196    //System.err.println(record); 
    166197    //System.err.println("--------------------------"); 
     
    168199    String paddedFileName = String.format("%04d.txt", recordID); 
    169200     
    170     File discardFolder = new File(this.outputFolder, "discard");         
    171     File keepFolder = new File(this.outputFolder, "keep"); 
    172201    File parentFolder = null; 
    173202     
    174203    if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 
    175         parentFolder = keepFolder; 
     204        parentFolder = WETProcessor.keepFolder; 
    176205        System.err.println("@@@KEEPING"); 
    177206    } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) { 
     
    183212        // So we have at least 500 chars (possibly on a single wrapped line) 
    184213        // containing at least 10 spaces. Such a record is also worth keeping. 
    185         parentFolder = keepFolder; 
     214        parentFolder = WETProcessor.keepFolder; 
    186215        } 
    187216    } 
     
    189218    // if parentFolder still not set, set to discard pile folder 
    190219    if(parentFolder == null) { 
    191         parentFolder = discardFolder; 
     220        parentFolder = WETProcessor.discardFolder; 
    192221        System.err.println("@@@DISCARDING"); 
    193222    } 
    194223 
     224    try { 
     225        if (parentFolder == WETProcessor.keepFolder) { 
     226        keepURLsWriter.write(recordURI + "\n"); 
     227        } else { 
     228        discardURLsWriter.write(recordURI + "\n"); 
     229        } 
     230    } catch(Exception e) { 
     231        System.err.println("Unable to write URL"); 
     232        e.printStackTrace(); 
     233    } 
     234    
    195235    System.err.println("--------------------------"); 
    196236     
     
    265305        return; 
    266306    }    
    267      
    268     File discardFolder = new File(outFolder, "discard"); 
    269     if(!discardFolder.exists()) { 
    270         discardFolder.mkdir(); 
     307 
     308    // static folders and files to be shared across all WETProcessor instances 
     309    WETProcessor.discardFolder = new File(outFolder, "discard"); 
     310    if(!WETProcessor.discardFolder.exists()) { 
     311        WETProcessor.discardFolder.mkdir(); 
    271312    }    
    272     File keepFolder = new File(outFolder, "keep"); 
    273     if(!keepFolder.exists()) { 
    274         keepFolder.mkdir(); 
    275     } 
    276  
     313    WETProcessor.keepFolder = new File(outFolder, "keep"); 
     314    if(!WETProcessor.keepFolder.exists()) { 
     315        WETProcessor.keepFolder.mkdir(); 
     316    } 
     317 
     318    WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");     
     319    if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) { 
     320        System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed."); 
     321        return; 
     322    } 
     323    WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt"); 
     324    if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) { 
     325        System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed."); 
     326        return; 
     327    } 
     328     
    277329    // Will list all the warc.wet files in the input directory or else their gzipped versions 
    278330    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());