package org.greenstone.atea;

import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.


/**
 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
 * Then go into the src folder of this extension before compiling or running.
 * 
 * Compile as:
 *     maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
 * 
 * Run as:
 *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
 */
public class NZTLDProcessor
{

    // log4j logger shared by all instances; it is named after this class.
    private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
    
    // Settings loaded from config.properties by the constructor (the wget command templates are read from it).
    private Properties configProps = null;
    // When true, the constructor stops after one processed URL and processURL() substitutes
    // a fixed sample WET file URL for whatever URL it was given.
    private boolean DEBUG_MODE = true;

    // Filter handed to File.listFiles() when walking a downloaded directory.
    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();    
    // Directory (relative to the working directory) that wget is launched from and
    // in which the zipped and unzipped WET files are looked for.
    private final String SITES_DIR = "../dwn-sites";
    // Prepended to each path read from the seed URLs file to form the full download URL.
    private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
    
    
    /**
     * Reports an informational message: echoes it on stderr, then records it
     * with the class logger at INFO level.
     *
     * @param msg the text to report
     */
    private void log(String msg) {
	final PrintStream console = System.err;
	console.println(msg);
	logger.info(msg);
    }

    /**
     * Reports a debugging message: echoes it on stderr (unconditionally), then
     * records it with the class logger at DEBUG level.
     *
     * @param msg the text to report
     */
    private void debug(String msg) {
	final PrintStream console = System.err;
	console.println(msg);
	logger.debug(msg);
    }
    
    /**
     * Reports an error: prints the message on stderr behind an "### ERROR: " marker,
     * then records the undecorated message with the class logger at ERROR level.
     *
     * @param msg the text to report
     */
    private void error(String msg) {
	final String decorated = "### ERROR: " + msg;
	System.err.println(decorated);
	logger.error(msg);
    }
    
    /**
     * Reports a warning: prints the message on stderr behind a "*** WARN: " marker,
     * then records the undecorated message with the class logger at WARN level.
     *
     * @param msg the text to report
     */
    private void warn(String msg) {
	final String decorated = "*** WARN: " + msg;
	System.err.println(decorated);
	logger.warn(msg);
    }
    
    /**
     * Loads config.properties from the classpath, then reads the given seed file line by
     * line, turning each Common Crawl WARC path into the URL of the matching WET file and
     * handing that URL to processURL().
     *
     * Blank lines and lines containing "/robotstxt/" or "/crawldiagnostics/" are skipped.
     * While DEBUG_MODE is on, reading stops after the first URL that was processed.
     *
     * @param seedURLsFile text file with one Common Crawl WARC file path per line
     * @throws Exception if config.properties cannot be found or read, or if the seed
     *         file cannot be read
     */
    public NZTLDProcessor(File seedURLsFile) throws Exception {
	log("In NZTLDProcessor constructor");

	if(DEBUG_MODE) {
	    warn("Debugger is turned ON!!!");
	}

	// config.properties is resolved through the class loader, so the directory that
	// contains it has to be on the classpath (see the "Run as" line in the class comment).
	try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
	    if(infile == null) {
		// getResourceAsStream() signals a missing resource by returning null, not by throwing.
		// Fail with a message that names the problem instead of a NullPointerException from load().
		throw new FileNotFoundException("config.properties was not found on the classpath");
	    }
	    configProps = new Properties();
	    configProps.load(infile);
	    // no explicit close(): try-with-resources closes the stream

	} catch(Exception e) {
	    System.err.println("Exception attempting to read properties from config.properties.");
	    throw e;
	}

	// Read the seed file with an explicit charset so the result does not depend on the platform default.
	try(BufferedReader br = new BufferedReader(
		new InputStreamReader(new FileInputStream(seedURLsFile), "UTF-8"))) {

	    final int maxURLs = 1; // only honoured while DEBUG_MODE is on
	    int urlCounter = 0;
	    String url;

	    while((url = br.readLine()) != null) {

		// ignore empty lines and stray whitespace, which would otherwise turn into bogus URLs
		url = url.trim();
		if(url.isEmpty()) continue;

		log("Got URL: " + url);

		// skip urls containing "/crawldiagnostics/" and "/robotstxt/"
		if(url.indexOf("/robotstxt/") != -1) continue;
		if(url.indexOf("/crawldiagnostics/") != -1) continue;

		// convert the remaining WARC urls to WET urls
		url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
		url = url.replace(".warc.gz", ".warc.wet.gz");

		// add the prefix
		url = COMMONCRAWL_DATA_PREFIX + url;

		log("Final WET URL: " + url);

		// keep going with the remaining URLs, but don't let a failure pass unnoticed
		if(!processURL(url)) {
		    warn("Failed to process WET URL: " + url);
		}

		if(DEBUG_MODE) {
		    urlCounter++;
		    if(urlCounter == maxURLs) break;
		}
	    }
	    // no explicit close(): try-with-resources closes the reader

	} catch (IOException ioe) {
	    System.err.println("Exception attempting to read from seedURLsFile.");
	    throw ioe;
	}
    }

    /**
     * Processes one Common Crawl WET file URL by delegating to processWETfile().
     *
     * While DEBUG_MODE is on, the URL passed in is ignored and a fixed sample WET file
     * URL is processed instead.
     *
     * Planned pipeline (not all implemented yet):
     *  - launch wget on URL
     *  - when download done, recurse through downloaded dir
     *  - for each htm/html/no-extension file: parse out text from paragraphs
     *    (TODO: later Tika on non html/htm/no-extension files)
     *  - for each para, run MaoriTextDetector and store results in db
     *  - remove download directory (or only remove if site doesn't contain Maori txt)?
     *
     * @param ccWETfileURL full URL of a gzipped WET file
     * @return true if the WET file was processed successfully, false otherwise
     */
    public boolean processURL(String ccWETfileURL) {

	if(DEBUG_MODE) {
	    ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
	}
	log("Processing WET file URL: " + ccWETfileURL);

	/*
	// Earlier approach: mirror the site with wget, then walk the downloaded directory
	//if(wgetMirrorSite(ccWETfileURL)) {
	if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
	
	processDownloadedDir(ccWETfileURL);
	
	//}
	*/

	// Pass the outcome on to the caller instead of unconditionally reporting success
	return processWETfile(ccWETfileURL);
    }

    /**
     * Makes sure the unzipped WET file for the given URL is present in SITES_DIR.
     *
     * If the unzipped file is already there, nothing is done. Otherwise the gzipped file
     * is downloaded with wget (unless it is already there too), unzipped with
     * Utility.unzipFile() and then deleted, whether or not unzipping worked.
     *
     * @param ccWETfileURL full URL of a gzipped WET file; its last path segment must end in ".gz"
     * @return true if the unzipped WET file is available, false if the URL is unusable or
     *         the download or the unzipping failed
     */
    public boolean processWETfile(String ccWETfileURL) {
	final String gzSuffix = ".gz";

	if(ccWETfileURL == null) {
	    error("No WET file URL given");
	    return false;
	}

	String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);

	// Without this check a name lacking ".gz" would make substring() below throw,
	// and a bare ".gz" would produce an empty output file name.
	if(!zippedWETfileName.endsWith(gzSuffix) || zippedWETfileName.length() == gzSuffix.length()) {
	    error("Expected the URL of a gzipped WET file (*.gz), but got: " + ccWETfileURL);
	    return false;
	}
	String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.length() - gzSuffix.length());

	File inZipFile = new File(SITES_DIR, zippedWETfileName);
	File WETfile = new File(SITES_DIR, unzippedWETfileName);

	if(WETfile.exists()) {
	    log("Unzipped WET file " + WETfile + " already exists");
	}
	else {

	    if(inZipFile.exists()) {
		log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
	    }
	    else if(!wgetWETfile(ccWETfileURL)) {
		return false;
	    }

	    // don't have the WET file yet. Get it from the zip file, which we know we should have by now

	    boolean success = Utility.unzipFile(inZipFile, WETfile);
	    if(success) {
		log("Unzipped " + inZipFile + " to " + WETfile);
	    } else {
		error("Failed to unzip " + inZipFile + " to " + WETfile);
		// A partially written output file (presumably possible if unzipping aborts midway)
		// would be mistaken for a complete WET file by the exists() test on the next run.
		if(WETfile.exists() && !WETfile.delete()) {
		    warn("Unable to delete incomplete WET file: " + WETfile);
		}
	    }

	    // whether we succeeded or not, get rid of the zipped file:
	    if(!inZipFile.delete()) {
		warn("Unable to delete zipped WET file: " + zippedWETfileName);
	    }

	    if(!success) {
		return false;
	    }
	}

	// TODO: read in the giant WET file and process its contents

	return true;
    }

    /*
    // Run gunzip
    // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
    // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
    public boolean unzipFile(File inZipFile, File outFile) {
	
	
	byte[] buffer = new byte[1024];

	// try-with-resources will safely close streams/dispose resources on success or error and exceptions
	try (	    
	    GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));	    
	    FileOutputStream out = new FileOutputStream(outFile);
	) {
	    int len;
	    while ((len = gzis.read(buffer)) > 0) {
        	out.write(buffer, 0, len);
	    }
	    
	    //gzis.close();
	    //out.close();
	    
	    log("Unzipped " + inZipFile + " to " + outFile);
	    
	} catch(IOException ex) {
	    error("Failed to unzip " + inZipFile);
	    ex.printStackTrace();
	    return false;
	}

	return true;
    }
    */
    
    // wget will be launched from the specified directory, SITES_DIR
    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html

    /**
     * Downloads a single WET file by running the "wget.file.cmd" command template from
     * config.properties, with its %%FILE_URL%% placeholder replaced by the given URL.
     *
     * @param ccWETfileURL full URL of the file to download
     * @return true if wget exited with status 0, false otherwise
     */
    public boolean wgetWETfile(String ccWETfileURL) {
	return runWget("wget.file.cmd", "%%FILE_URL%%", ccWETfileURL);
    }

    /**
     * Mirrors a site by running the "wget.mirror.cmd" command template from
     * config.properties, with its %%BASE_URL%% placeholder replaced by the given URL.
     *
     * @param nzDomainURL URL of the site to mirror
     * @return true if wget exited with status 0, false otherwise
     */
    public boolean wgetMirrorSite(String nzDomainURL) {
	return runWget("wget.mirror.cmd", "%%BASE_URL%%", nzDomainURL);
    }

    /**
     * Runs the wget command template stored under the given property, from SITES_DIR,
     * using SafeProcess. Output of the process is echoed and logged line by line.
     *
     * @param cmdProperty name of the config.properties entry holding the command template
     * @param placeholder placeholder text in the template that stands for the URL
     * @param url the URL to substitute for the placeholder
     * @return true if the process exited with status 0, false if the template or URL is
     *         missing or the process exited with any other status
     */
    private boolean runWget(String cmdProperty, String placeholder, String url) {
	String wgetCmdTemplate = configProps.getProperty(cmdProperty);
	if(wgetCmdTemplate == null || wgetCmdTemplate.trim().isEmpty()) {
	    error("Invalid or empty " + cmdProperty + " in config.properties");
	    return false;
	}
	if(url == null || url.trim().isEmpty()) {
	    error("No URL given for " + cmdProperty);
	    return false;
	}

	// Split the template into arguments BEFORE substituting the URL, so that the URL always
	// ends up inside a single argument, whatever characters it contains. Splitting on runs of
	// whitespace also avoids the empty arguments that split(" ") yields for repeated spaces.
	String[] wgetCommandArgs = wgetCmdTemplate.trim().split("\\s+");
	for(int i = 0; i < wgetCommandArgs.length; i++) {
	    wgetCommandArgs[i] = wgetCommandArgs[i].replace(placeholder, url);
	}

	// human-readable form of the command, used in messages only
	String wgetCmd = wgetCmdTemplate.replace(placeholder, url);
	log("Will launch wget with the command: " + wgetCmd);

	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));

	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);

	int exitValue = wgetProcess.runProcess(outLineHandler, errLineHandler);

	if(exitValue != 0) {
	    // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
	    error("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + exitValue + "\"");
	    return false;
	}

	return true;
    }

    
    // ----------- inner class for SafeProcess to Run Wget ----------
    public class ProcessLineHandler extends SafeProcess.LineByLineHandler
    {
	//static Logger logger = Logger.getLogger(this.getClass().getName());;
	Logger logger;
	
	public ProcessLineHandler(Logger logger, int src)
	{
	    super(src); // will set this.source to STDERR or STDOUT
	    //logger = Logger.getLogger(this.getClass().getName());
	    this.logger = logger;
	}

	public void gotLine(String line) { // first non-null line

	    // String form of this.source will be "stderr" or "stdout"
	    String msg = SafeProcess.streamToString(source) + ": " + line;
	    System.err.println(msg);
	    //NZTLDProcessor.this.logger.info(msg);
	    logger.info(msg);
	}
	public void gotException(Exception e) {
	    String msg = "Error in reading process' " + SafeProcess.streamToString(source);
	    //NZTLDProcessor.this.logger.error(msg, e);
	    logger.error(msg, e);
	}

    }


    /**
     * Walks the download location for the given URL (SITES_DIR combined with the URL
     * string) and processes every file found there that passes the file filter.
     *
     * @param ccWETfileURL the URL whose downloaded content is to be processed
     * @return true if the download location existed and was processed, false if it was missing
     */
    public boolean processDownloadedDir(String ccWETfileURL) {
	final File siteDir = new File(SITES_DIR, ccWETfileURL);
	log("*** Will process download dir " + siteDir);

	final boolean present = siteDir.exists();
	if(present) {
	    /*
	    if(!downloadedSiteDir.isDirectory()) {
	        error("Downloaded site " + downloadedSiteDir + " is not a directory!");
	        return false; // TODO: or redo wget with "www." prefixed to URL???
	    }
	    */
	    // recurse through the downloaded directory, processing each file
	    recursivelyProcessDir(ccWETfileURL, siteDir);

	    debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
	} else {
	    error("Expected downloaded site " + siteDir + ", but it did not exist!");
	}

	return present;
    }

    /**
     * Depth-first walk: a plain file is handed to processFile(); for a directory, each
     * child accepted by the file filter is visited in turn.
     *
     * @param ccWETfileURL the URL the downloaded content belongs to, passed through to processFile()
     * @param file the file or directory to visit
     */
    private void recursivelyProcessDir(String ccWETfileURL, File file) {

	// base step: anything that is not a directory is processed directly
	if(!file.isDirectory()) {
	    processFile(ccWETfileURL, file);
	    return;
	}

	// recursive step. listFiles() yields null on an IO error (or for a non-directory,
	// which was ruled out above) and an empty array for an empty directory.
	File[] entries = file.listFiles(extractableTxtFilesFilter);
	if(entries == null) {
	    error("IO error occurred when trying to list children of " + file);
	    return;
	}

	for(File entry : entries) {
	    recursivelyProcessDir(ccWETfileURL, entry);
	}
    }	
    
    /*
    public boolean processDownloadedDir(String ccWETfileURL) {
	// recurse through the downloaded directory, then process each file

	File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
	if(!downloadedSiteDir.exists()) {
	    error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
	    return false;
	}

	File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
	if(files == null) {
	    error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
	    return false;
	}
	recursivelyProcessDir(ccWETfileURL, files);
	
	return true;
    }

    private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
	for(int i = 0; i < files.length; i++) {
	    if(files[i].isDirectory()) { // recursive step

		// children array will be empty if dir empty
		// children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
		File[] children = files[i].listFiles(extractableTxtFilesFilter);
		if(children == null) {
		    error("IO error trying to list children of " + files[i]);
		} else {
		    recursivelyProcessDir(ccWETfileURL, children);
		}
	    }
	    else { // base step
		processFile(ccWETfileURL, files[i]);
	    }
	}
    }*/
    
    /**
     * Placeholder for per-file processing; at present it only logs the file's path.
     *
     * Still to do:
     *  - skip if js, css, robots.txt
     *  - if no-extension or htm or html, call processHTMLFile()
     *  - else: skip for now, TODO: later call Tika on other file types
     *  - TODO: update db with url info, page location and para?
     *
     * @param ccWETfileURL the URL the file was downloaded for (currently unused)
     * @param file the file to process
     * @return always true
     */
    public boolean processFile(String ccWETfileURL, File file) {
	final String notice = "*** Processing file " + file;
	log(notice);

	return true;
    }

    /**
     * Filename filter that rejects files we do not attempt to extract text from:
     * robots.txt, and anything with a json, js, css, svg, ico, jpg/jpeg, png, gif or
     * tif/tiff extension, optionally followed by a query string.
     *
     * For now image files are skipped too. Later, with Tika, it may be possible to extract
     * text from images through OCR of those images that represent text. Files with no file
     * extension are accepted, e.g. waikato uni has lots of files without extension that
     * contain html.
     */
    private class ExtractableTextFileFilter implements FilenameFilter {

	// Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected.
	// "tiff?" covers both .tif and .tiff (the earlier "tif?" matched .ti and .tif, but not .tiff).
	public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tiff?)($|\\?.*)";

	// Compiled once per filter rather than on every accept() call. Kept as an instance field
	// because a non-static inner class may only declare constant static members (before Java 16).
	private final java.util.regex.Pattern unacceptablePattern = java.util.regex.Pattern.compile(unacceptableRegex);

	/**
	 * @param dir the directory in which the file was found (unused)
	 * @param name the name of the file
	 * @return false for robots.txt and for names matching unacceptableRegex, true otherwise
	 */
	@Override
	public boolean accept(File dir, String name) {
	    // literal comparison: as a regex, the '.' in "robots.txt" would match any character
	    if("robots.txt".equals(name)) {
		return false;
	    }
	    return !unacceptablePattern.matcher(name).matches();
	}
    }

    
    /**
     * Placeholder for HTML processing; not implemented yet.
     * The plan noted below is to extract paragraphs with JSoup and pass the text to processParas().
     *
     * @return always true at present
     */
    public boolean processHTMLFile() {
	// Use JSoup to get paras

	// call processParas(text);

	return true;
    }

    /**
     * Placeholder for processing of non-HTML files; not implemented yet.
     * The plan noted below is to extract text with Tika and pass it to processParas().
     *
     * @return always true at present
     */
    public boolean processNonHTMLFile() {
	// Use Tika to get text

	// call processParas(text)

	return true;
    }

    /**
     * Placeholder for paragraph-level processing; not implemented yet, so the text is ignored.
     *
     * @param text the text to be split into paragraphs (currently unused)
     * @return always true at present
     */
    public boolean processParas(String text) {

	// Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
	
	// for each para, call MaoriTextDetector.java

	return true;
    }
    // --------------------------- End processing methods --------------------

    /**
     * Command line entry point. Expects one argument: the path to the file of seed URLs.
     * Exits with a non-zero status if the argument is missing, the file does not exist,
     * or processing fails with an exception.
     *
     * @param args args[0] is the path to the seed URLs file
     */
    public static void main(String[] args) {
	if(args.length <= 0) {
	    System.err.println("ERROR: Must provide input file of unique nz website addresses");
	    System.exit(-1);
	}

	String infilePath = args[0];

	System.err.println("Requested to process URLS file: "  + infilePath);

	File infile = new File(infilePath);
	if(!infile.exists()) {
	    System.err.println("ERROR: File " + infilePath + " did not exist");
	    System.exit(-1);
	}

	try {
	    // all the work happens in the constructor
	    new NZTLDProcessor(infile);
	} catch(Exception e) {
	    e.printStackTrace();
	    // let calling scripts see that the run failed
	    System.exit(-1);
	}

    }
}