package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

//import org.bson.types.ObjectId;

import org.greenstone.atea.morphia.*;


/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separator, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * MORE IMPORTANT PRELIMINARIES:
 * - Make sure the MongoDB is up and running and accessible.
 * - If you want to keep any existing MongoDB collections called Websites and Webpages, then
 *   first rename those collections in MongoDB (using Robo3T makes renaming easy) before
 *   running this program.
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class

    /** Counter for number of sites.
     * Should be equal to the number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */

    // private handle to a csv writer
    private CSVPrinter emptyWebPageInfoCSVPrinter;

    private int countOfWebPagesWithBodyText = 0;

    private String geoLocationCountryCode = null; /** 2 letter country code */

    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */

    private String domainOfSite;
    //private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.

    private int numPagesInMRI = 0;
    private int numPagesContainingMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if storing too many, as we store this info per sentence
     * and a long text document becomes a very large MongoDB document presumably. */
    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model


    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol)
        // or the very start of the file with a URL (protocol)
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
                                  CSVPrinter emptyWebPageInfoCSVPrinter,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // keep a handle to the csv file writer
        this.emptyWebPageInfoCSVPrinter = emptyWebPageInfoCSVPrinter;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            boolean readingText = false;
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // iff outside of a page's body text, then an empty line marks the end of a page
                // in nutch's text dump of a site.
                // But note, there can be an empty line (or more?) between the start and end
                // markers of a page's text, though.
                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        inspectPageURLPath(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(!line.equals("")) {
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump == null) {
                logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
            } else {
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                inspectPageURLPath(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
            //this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
        }
        else {
            this.domainOfSite = "UNKNOWN";
            //this.baseSiteDomain = "UNKNOWN";
        }

        /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above.
        // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path
        for(TextDumpPage aPage : pages) {
            inspectPageURLPath(aPage);
        }
        */

        webPageDataToMongoDB(mongodbAccess);
    }


    /** for every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
     * This method is called on each page of a site as the page is created.
     */
    private void inspectPageURLPath(TextDumpPage page) {
        String url = page.getPageURL();
        //logger.debug("@@@@ pageURL: " + url);

        if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site,
            // check if this page of the site contains /mi(/) or http(s)://mi in its URL path
            if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) {
                this.urlContainsLangCodeInPath = true;
            }
        }
    }


    private void webPageDataToMongoDB(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                System.err.println(siteID + ",Empty page " + i + "," + page.getPageURL()
                                   + "," + page.get("status") + "," + page.get("protocolStatus")
                                   + "," + page.get("parseStatus"));

                // write information about any empty web page into the emptyPage csv file
                emptyWebPageInfoCSVPrinter.printRecord(siteID, i, page.getPageURL(),
                                                       page.get("status"), page.get("protocolStatus"), page.get("parseStatus"));

                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
                countOfWebPagesWithBodyText++; // of this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                int numSentencesInMRI = 0;

                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
                                                                          this.siteID/*SITE_COUNTER*/,
                                                                          isMRI,
                                                                          totalSentences,
                                                                          singleSentences,
                                                                          overlappingSentences);

                for(SentenceInfo si : singleSentences) {
                    //LanguageInfo bestLanguage = si.languagesInfo[0];
                    //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                    if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                        numSentencesInMRI++;
                    }
                }

                webpage.setMRISentenceCount(numSentencesInMRI);
                webpage.setContainsMRI((numSentencesInMRI > 0));

                if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
                    // Not sure if we can trust that a single sentence detected as Maori on a page is really Maori.
                    // But if at least 5 sentences are detected as Maori, it is more likely to actually be MRI?
                    numPagesContainingMRI++;
                }

                //mongodbAccess.insertWebpageInfo(webpage);
                // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
                mongodbAccess.datastore.save(webpage);
            }
        }
    }


    public void websiteDataToDB() {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but need some indication that the MRI content was not just a one-off in the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
        try {
            if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
                this.geoLocationCountryCode = "UNKNOWN";
            } else {
                this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
            }
        } catch(Exception e) {
            logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);

            //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
            //this.geoLocationCountryCode = "NZ";
            //}

            // Help along identification of the domain's country by treating a 2 letter extension
            // after the last period mark as its TLD
            int periodIndex = domainOfSite.length()-3;
            // .com|org etc extensions that have 3 chars after the period mark will remain unknown
            // 2 letter extensions will be considered TLDs
            if(periodIndex >= 0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
                // has a 2 letter TLD.
                // Make it uppercase to match the return value of Utility.getCountryCodeOfDomain() above
                String TLD = domainOfSite.substring(periodIndex+1);
                this.geoLocationCountryCode = TLD.toUpperCase();
            } else {
                this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN, not null
            }
        }

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
                                              this.domainOfSite, //this.baseSiteDomain,
                                              totalPages, this.countOfWebPagesWithBodyText,
                                              this.numPagesInMRI, this.numPagesContainingMRI,
                                              this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                                              this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        //mongodbAccess.insertWebsiteInfo(website);
        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
        mongodbAccess.datastore.save(website);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <crawled folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;

        try (
             MongoDBAccess mongodb = new MongoDBAccess();
             CSVPrinter emptyWebPageInfoCSVPrinter = new CSVPrinter(new FileWriter("InfoOnEmptyPagesNotInMongoDB.csv"),
                                                                    CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
        ) {
            mongodb.connectToDB();
            //mongodb.showCollections();

            // write out csv column headings into the csv file on empty web pages
            emptyWebPageInfoCSVPrinter.printRecord("siteID","pagenum","URL","(fetch)status","protocolStatus","parseStatus");

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html

            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        if(siteID.contains("_")) {
                            logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
                            continue;
                        }

                        long lastModified = siteDir.lastModified();
                        logger.debug("@@@ Processing siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                                mongodb, emptyWebPageInfoCSVPrinter, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();
                        nutchTxtDump.websiteDataToDB();
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating the NutchTextDumpToMongoDB instance
            // or with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}