Changeset 33468


Ignore:
Timestamp:
2019-09-13T19:24:27+12:00 (5 years ago)
Author:
ak19
Message:

It is more meaningful to (also) write out the URLs of the kept and discarded records to the keepURLs.txt and discardURLs.txt files than to write out only the WET records. That way, we have URLs to start downloading sites from (after removing duplicates and URLs of the same domain).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33467 r33468  
    2121 *
    2222 * e.g.
    23  *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
    24  *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
    25  *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
     23 *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
     24 *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
    2625 *
    2726*/
     
    5554    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
    5655    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
     56
     57    // File paths shared across WETProcessor instances
     58    private static File discardFolder;
     59    private static File keepFolder;
     60    private static File keepURLsFile;
     61    private static File discardURLsFile;
     62
     63    // WARC WET header lines and header line prefixes of interest
     64    static final String WARC_RECORD_START = "WARC/1.0";
     65    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
     66    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
     67    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
    5768   
    5869    // Keep a count of all the records that all WETProcessors instantiated
     
    7687    String line = null;
    7788    boolean readingRecord = false;
     89
     90    String WARCtargetURI = "";
    7891   
    7992    int recordCount = 0;
     
    92105    fileID = fileID.substring(0, fileID.indexOf("."));
    93106    this.WETFileID = fileID;
    94    
     107
     108       
    95109    // read from WETfile
    96     try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
     110    try (
     111         BufferedReader reader = new BufferedReader(new FileReader(inFile));         
     112         BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
     113         BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
     114         ) {
    97115       
    98116        while((line = reader.readLine()) != null) { // readLine removes newline separator
    99117
    100         if(line.startsWith("WARC-Type: warcinfo")) {
     118        if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
    101119            readingRecord = false;
    102120            record = null; // drop this record, which is just an info record not actual web page's text
     
    105123        }
    106124       
    107         if(line.startsWith("WARC/1.0")) { // finished previous WET record
     125        if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
    108126            // process any previous record
    109127            if(record != null) {
    110             processWETrecord(recordCount, contentLength, lineCount, record.toString());
     128            processWETrecord(keepURLsWriter, discardURLsWriter,
     129                     recordCount, contentLength, lineCount,
     130                     WARCtargetURI, record.toString());
    111131            record = null;
    112132            contentLength = -1;
     
    122142        if(readingRecord) { // append current line to current record
    123143
     144            if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
     145            // get and store the value
     146            WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
     147            }
     148           
    124149            record.append(line + "\n"); // add back (unix style) line ending
    125150
     
    132157            lineCount++;
    133158            }
    134             else if(line.startsWith("Content-Length:")) {
    135             String contentLengthStr = line.substring("Content-Length:".length()).trim();
     159            else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
     160            String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
    136161            contentLength = Integer.parseInt(contentLengthStr);
    137162            lineCount = 0;
     
    144169        // flush the last record. If it was a warcinfo record, record would be null here
    145170        if(record != null) {
    146         processWETrecord(recordCount, contentLength, lineCount, record.toString());
     171        processWETrecord(keepURLsWriter, discardURLsWriter,
     172                 recordCount, contentLength, lineCount,
     173                 WARCtargetURI, record.toString());
    147174        record = null;
    148175        }
     
    156183     * Determines if a WET record belongs in the keep or discard pile depending on if it
    157184     * contains enough text, based on contentLength and line count of the record body.
    158      * Then writes out the WET record to a uniquely named file in the keep or discard folder.
     185     * Then writes out the WET record to a uniquely named file in the keep or discard folder,
     186     * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
    159187     */
    160     private void processWETrecord(int recordID, int contentLength, int lineCount, String record)
     188    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
     189                  int recordID, int contentLength, int lineCount,
     190                  String recordURI, String record)
    161191    {
    162192    System.err.println("WET #" + this.WETFileID + " record #" + recordID
    163193               + " - contentLength: " + contentLength
    164194               + " - lineCount: " + lineCount);
     195    System.err.println("URI: " + recordURI);
    165196    //System.err.println(record);
    166197    //System.err.println("--------------------------");
     
    168199    String paddedFileName = String.format("%04d.txt", recordID);
    169200   
    170     File discardFolder = new File(this.outputFolder, "discard");       
    171     File keepFolder = new File(this.outputFolder, "keep");
    172201    File parentFolder = null;
    173202   
    174203    if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
    175         parentFolder = keepFolder;
     204        parentFolder = WETProcessor.keepFolder;
    176205        System.err.println("@@@KEEPING");
    177206    } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
     
    183212        // So we have at least 500 chars (possibly on a single wrapped line)
    184213        // containing at least 10 spaces. Such a record is also worth keeping.
    185         parentFolder = keepFolder;
     214        parentFolder = WETProcessor.keepFolder;
    186215        }
    187216    }
     
    189218    // if parentFolder still not set, set to discard pile folder
    190219    if(parentFolder == null) {
    191         parentFolder = discardFolder;
     220        parentFolder = WETProcessor.discardFolder;
    192221        System.err.println("@@@DISCARDING");
    193222    }
    194223
     224    try {
     225        if (parentFolder == WETProcessor.keepFolder) {
     226        keepURLsWriter.write(recordURI + "\n");
     227        } else {
     228        discardURLsWriter.write(recordURI + "\n");
     229        }
     230    } catch(Exception e) {
     231        System.err.println("Unable to write URL");
     232        e.printStackTrace();
     233    }
     234   
    195235    System.err.println("--------------------------");
    196236   
     
    265305        return;
    266306    }   
    267    
    268     File discardFolder = new File(outFolder, "discard");
    269     if(!discardFolder.exists()) {
    270         discardFolder.mkdir();
     307
     308    // static folders and files to be shared across all WETProcessor instances
     309    WETProcessor.discardFolder = new File(outFolder, "discard");
     310    if(!WETProcessor.discardFolder.exists()) {
     311        WETProcessor.discardFolder.mkdir();
    271312    }   
    272     File keepFolder = new File(outFolder, "keep");
    273     if(!keepFolder.exists()) {
    274         keepFolder.mkdir();
    275     }
    276 
     313    WETProcessor.keepFolder = new File(outFolder, "keep");
     314    if(!WETProcessor.keepFolder.exists()) {
     315        WETProcessor.keepFolder.mkdir();
     316    }
     317
     318    WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");   
     319    if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
     320        System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
     321        return;
     322    }
     323    WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
     324    if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
     325        System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
     326        return;
     327    }
     328   
    277329    // Will list all the warc.wet files in the input directory or else their gzipped versions
    278330    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
Note: See TracChangeset for help on using the changeset viewer.