package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * A single instance of the WETProcessor class can process a single unzipped warc.wet file.
 * A WETProcessor takes a warc.wet file and goes through all its WET records,
 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
 * and its URL is written into a keep, discard or greylisted text file, based on:
 *
 * 1. whether it's whitelisted, else greylisted else blacklisted
 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just word count and number of MAX characters
 * (not MINIMUM characters) that determine a string is a word.
 * Explicit whitelisting takes precedence over greylisting, which in turn takes
 * precedence over blacklisting.
 * However, even explicitly whitelisted urls still need to have sufficient content to end 
 * up in keepURLs.txt.
 *
 * See CCWETProcessor.java for compile instructions and how to run.
 *
*/
public class WETProcessor {
    private static final Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());

    // WARC WET header lines and header line prefixes of interest
    static final String WARC_RECORD_START = "WARC/1.0";
    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

    /**
     * Charset used for all reading and writing done by this class, so that results do not
     * depend on the platform default charset.
     * NOTE(review): assumes the warc.wet input is UTF-8 encoded - confirm against the
     * Common Crawl WET format documentation.
     */
    private static final java.nio.charset.Charset WET_CHARSET = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Matches a whole word that contains a lowercase ASCII letter immediately followed by an
     * uppercase ASCII letter (a sign of words glued together). Compiled once instead of
     * once per word.
     */
    private static final java.util.regex.Pattern CAMEL_CASE_WORD
        = java.util.regex.Pattern.compile(".*[a-z][A-Z].*");

    private final String crawlID;
    private final int WETFileID;
    private final File inFile;

    /** Running count of the (non-warcinfo) WET records encountered by processWETFile(). */
    private int recordCount = 0;

    /** Handle to a CCWETProcessor that processes a set of WET files
     * Whereas a WETProcessor instance only processes a single WET file
     * containing multiple WET records.
     */
    private final CCWETProcessor batchProcessor;

    /**
     * WET processor processes a single warc.wet file containing multiple WET records.
     * Each individual WET record is written out to a uniquely named file in the keep,
     * greylisted or discard folder of the batchProcessor, depending on the black/grey/white
     * listing of the record's URL and on the number of valid words in the record.
     *
     * The unique name of each record file is built from the crawlID, the numeric suffix of the
     * warc.wet file name and the record count within the file. For an inFile named
     * MAORI-CC-2019-30-20190902100139-000000.warc.wet the numeric suffix is "000000", i.e. 0,
     * so that 2019-30-00-0015 is the 15th WET record of that file.
     *
     * @param inFile the warc.wet file whose WET records are to be processed
     * @param crawlID is the ID of the commoncrawl containing this warc.wet file
     * and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
     * as prefix to create unique filenames when storing each individual record).
     * @param batchProcessor supplies the output folders, the URL list files, the
     * black/grey/white list checks and the word count thresholds
     * @throws NumberFormatException if no numeric file ID can be extracted from inFile's name
     */
    public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
        this.batchProcessor = batchProcessor;
        this.inFile = inFile;
        this.crawlID = crawlID;
        this.WETFileID = parseWETFileID(inFile.getName());
    }

    /**
     * Extracts the numeric ID of a warc.wet file from its name: the trailing run of digits of
     * the part after the last hyphen and before the first '.' that follows it.
     * E.g. "MAORI-CC-2019-30-20190902100139-000010.warc.wet" gives 10.
     *
     * Looking for the last '0' character instead (as was done previously) maps file names
     * ending in -000010, -000020, -000100 etc. all onto ID 0, making record file names collide.
     *
     * @throws NumberFormatException if there is no such run of digits or it doesn't fit an int
     */
    private static int parseWETFileID(String wetFileName) {
        String stem = wetFileName.substring(wetFileName.lastIndexOf('-') + 1);
        int extensionStart = stem.indexOf('.');
        if (extensionStart >= 0) {
            stem = stem.substring(0, extensionStart);
        }

        int digitsStart = stem.length();
        while (digitsStart > 0) {
            char c = stem.charAt(digitsStart - 1);
            if (c < '0' || c > '9') {
                break;
            }
            digitsStart--;
        }

        String digits = stem.substring(digitsStart);
        if (digits.isEmpty()) {
            throw new NumberFormatException("No numeric WET file ID in file name: " + wetFileName);
        }
        return Integer.parseInt(digits);
    }

    /** Opens a buffered writer that appends to the given file using WET_CHARSET. */
    private static BufferedWriter openAppendingWriter(File file) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, true), WET_CHARSET));
    }

    /**
     * Processes all the WET records of a single warc.wet file.
     * A record starts at a "WARC/1.0" line and runs until the next such line or end of file.
     * Records of type warcinfo are dropped and not counted. Every other record is handed to
     * processWETrecord() together with its target URI, its Content-Length header value and
     * the number of non-empty lines following the Content-Length header.
     * IOExceptions are logged, not propagated.
     *
     * @return the value of the record counter after processing
     */
    public int processWETFile() {
        File keepURLsFile = this.batchProcessor.keepURLsFile;
        File discardURLsFile = this.batchProcessor.discardURLsFile;
        File greyListedFile = this.batchProcessor.greyListedFile;

        StringBuilder record = null;
        boolean readingRecord = false;

        String WARCtargetURI = "";
        int contentLength = -1; // of record, -1 while unknown
        int lineCount = -1; // number of non-empty lines in record body, -1 while still in the headers

        try (
             BufferedReader reader = new BufferedReader(
                 new InputStreamReader(new FileInputStream(this.inFile), WET_CHARSET));
             BufferedWriter keepURLsWriter = openAppendingWriter(keepURLsFile);
             BufferedWriter discardURLsWriter = openAppendingWriter(discardURLsFile);
             BufferedWriter greyListedURLsWriter = openAppendingWriter(greyListedFile)
             ) {

            String line;
            while ((line = reader.readLine()) != null) { // readLine removes newline separator

                if (line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
                    // drop this record, which is just an info record not actual web page's text,
                    // and undo the count made when its "WARC/1.0" line was seen
                    readingRecord = false;
                    record = null;
                    recordCount--;
                    continue;
                }

                if (line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
                    if (record != null) {
                        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                         recordCount, contentLength, lineCount,
                                         WARCtargetURI, record.toString());
                    }

                    // get ready to start a new record: reset all per-record state so that
                    // nothing (e.g. the target URI) leaks in from the previous record
                    recordCount++;
                    readingRecord = true;
                    record = new StringBuilder();
                    WARCtargetURI = "";
                    contentLength = -1;
                    lineCount = -1;
                }

                if (!readingRecord) {
                    continue;
                }

                if (line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
                    WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
                }

                record.append(line).append('\n'); // add back (unix style) line ending

                // Lines are only counted once lineCount >= 0, which is set when the
                // Content-Length header has been seen, and only if they're non-empty.
                if (lineCount >= 0 && !line.trim().isEmpty()) {
                    lineCount++;
                }
                else if (line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
                    String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
                    try {
                        contentLength = Integer.parseInt(contentLengthStr);
                    } catch (NumberFormatException nfe) {
                        // contentLength is only reported in the log, so a malformed header
                        // should not abort the processing of the entire file
                        logger.warn("Unable to parse header '" + line + "' in " + this.inFile, nfe);
                        contentLength = -1;
                    }
                    lineCount = 0;
                }
            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if (record != null) {
                processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                 recordCount, contentLength, lineCount,
                                 WARCtargetURI, record.toString());
            }

        } catch (IOException ioe) {
            logger.error("Error processing WET file " + this.inFile, ioe);
        }

        return recordCount;
    }

    /** @return the current value of the record counter maintained by processWETFile() */
    public int getRecordCount() { return this.recordCount; }

    /**
     * Determines if a WET record belongs in the keep, greylisted or discard pile, writes the
     * record's URL to the matching URL list writer and writes the record itself to a uniquely
     * named file (YYYY-##-##-####) in the matching folder.
     *
     * - Greylisted or blacklisted URLs that are not explicitly whitelisted go to the
     *   greylisted folder (if greylisted) or else the discard folder.
     * - All other URLs, including explicitly whitelisted ones, are kept only if the record
     *   has at least batchProcessor.MIN_NUM_WORDS valid words, and are discarded otherwise.
     *
     * contentLength and lineCount are only reported in the log.
     */
    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
                                  BufferedWriter greyListedURLsWriter,
                                  int recordID, int contentLength, int lineCount,
                                  String recordURI, String record)
    {
        logger.info("CrawlID: CC-MAIN-" + this.crawlID
                           + " WET #" + this.WETFileID
                           + " record #" + recordID
                           + " - contentLength: " + contentLength
                           + " - lineCount: " + lineCount);
        logger.info("URI: " + recordURI);

        File parentFolder = folderFromURLLists(recordURI);

        // If URL was not blacklisted/greylisted, or was even explicitly whitelisted,
        // it still needs sufficient content for language analysis to end up in the keep list.
        // We don't care about the combination of number of lines and content-length,
        // we just care about the number of "valid words" as defined by us.
        if (parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) {
            if (countValidWords(record) >= batchProcessor.MIN_NUM_WORDS) {
                parentFolder = batchProcessor.keepFolder;
                logger.debug("@@@KEEPING");
            }
            else {
                // Also covers the tentative keep of an explicitly whitelisted URL: without
                // this, such a record was kept regardless of how little content it had.
                parentFolder = batchProcessor.discardFolder;
                logger.debug("@@@DISCARDING");
            }
        }

        BufferedWriter urlWriter;
        if (parentFolder == batchProcessor.keepFolder) {
            urlWriter = keepURLsWriter;
        } else if (parentFolder == batchProcessor.greyListedFolder) {
            urlWriter = greyListedURLsWriter;
        } else {
            urlWriter = discardURLsWriter;
        }

        try {
            urlWriter.write(recordURI + "\n");
        } catch (IOException e) {
            logger.error("Unable to write URL " + recordURI, e);
        }

        logger.debug("--------------------------");

        // outFilename will look something like YYYY-##-##-####
        String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
        File outFile = new File(parentFolder, outFilename);

        try (BufferedWriter writer = new BufferedWriter(
                 new OutputStreamWriter(new FileOutputStream(outFile), WET_CHARSET))) {
            writer.write(record);
        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe);
        }
    }

    /**
     * Works out the folder a record belongs in going by the URL lists alone.
     * Explicit whitelisting overrides greylisting, which overrides blacklisting.
     *
     * @return the greylisted or discard folder if that is final, the keep folder as a
     * tentative choice for an explicitly whitelisted URL that is also grey/blacklisted,
     * or null if the URL is neither blacklisted nor greylisted. In the last two cases
     * the caller still has to check the amount of content.
     */
    private File folderFromURLLists(String recordURI) {
        boolean blacklisted = batchProcessor.isBlacklisted(recordURI);

        if (!blacklisted && !batchProcessor.isGreylisted(recordURI)) {
            return null;
        }
        if (batchProcessor.isWhitelisted(recordURI)) {
            return batchProcessor.keepFolder; // tentative
        }
        // for a blacklisted URL, greylisting wasn't checked yet;
        // a non-blacklisted URL that got here is known to be greylisted (e.g. products sites)
        if (!blacklisted || batchProcessor.isGreylisted(recordURI)) {
            logger.debug("@@@GREYLISTED");
            return batchProcessor.greyListedFolder;
        }
        logger.debug("@@@DISCARDING - blacklisted"); // url was only blacklisted
        return batchProcessor.discardFolder;
    }

    /**
     * Counts the "valid words" in a record: whitespace separated tokens of 1 up to
     * batchProcessor.MAX_WORD_LENGTH chars (in Maori, word length of 1 is not uncommon).
     * Longer tokens may point to words having been glued together. For the same reason,
     * camelcased tokens of 5 or more chars are not counted, though a record is not
     * discarded merely for containing them.
     *
     * NOTE(review): the record text passed in by processWETFile() starts at the "WARC/1.0"
     * line, so tokens of the WARC header lines are counted too - confirm this is intended.
     */
    private int countValidWords(String record) {
        int validWordCount = 0;

        // consecutive whitespace produces empty tokens, which the length check skips
        for (String word : record.split("\\s")) {
            int length = word.length();
            if (length >= 5 && CAMEL_CASE_WORD.matcher(word).matches()) {
                continue;
            }
            if (length >= 1 && length <= batchProcessor.MAX_WORD_LENGTH) {
                validWordCount++;
            }
        }
        return validWordCount;
    }

}