package org.greenstone.atea;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;

import org.greenstone.atea.morphia.*;

/**
 * Parses a single page's section of a Nutch text-dump into a key/value map
 * ("tuples") of the page's metadata plus its body text, and exposes simple
 * accessors over that map.
 *
 * <p>Expected dump layout (one page): a first line of the form
 * {@code <pageURL> key:<key>}, followed by {@code meta:value} lines, with the
 * page body delimited by the {@link #TEXT_START_MARKER} and
 * {@link #TEXT_END_MARKER} lines.
 */
public class TextDumpPage {
    private static final Logger logger =
	Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());

    /** Line marking the start of a page's body text in the dump. */
    public static final String TEXT_START_MARKER = "text:start:";
    /** Line marking the end of a page's body text in the dump. */
    public static final String TEXT_END_MARKER = "text:end:";

    /** All parsed (meta)data for this page, keyed by metadata name. */
    private final Map<String, String> tuples;

    boolean DEBUG_MODE = false;

    /**
     * Parses one page's unparsed text-dump section into the tuples map.
     *
     * @param siteID           identifier of the site this page belongs to
     *                         (currently unused here; kept for callers)
     * @param unparsedPageDump the raw dump text for exactly one page
     */
    public TextDumpPage(String siteID, String unparsedPageDump) {
	tuples = new HashMap<>();

	try (
	     BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
	     ) {
	    // First line (if any) always contains the pageURL, optionally
	    // followed by "key:<key>" somewhere after it.
	    String line = reader.readLine();
	    if (line != null) {
		int endIndex = line.indexOf("key:");
		if (endIndex == -1) {
		    // No "key:" present — the whole line is the pageURL.
		    tuples.put("pageURL", line.trim());
		} else {
		    tuples.put("pageURL", line.substring(0, endIndex).trim());
		    tuples.put("key", line.substring(endIndex).trim());
		}
	    }

	    boolean readingPageText = false;
	    StringBuilder pageText = null;

	    // Continue reading all other tuples for this page, if any.
	    // Each line is either metadata, a start/end-of-body marker,
	    // or part of the body text itself.
	    while ((line = reader.readLine()) != null) {
		line = line.trim();

		if (line.equals(TEXT_START_MARKER)) {
		    // Start of the page body text.
		    pageText = new StringBuilder();
		    readingPageText = true;
		} else if (line.equals(TEXT_END_MARKER)) {
		    // End of the page body. Guard against a stray end-marker
		    // appearing before any start-marker (pageText still null).
		    if (pageText != null) {
			// trim() removes the final artificial newline we
			// introduced while accumulating the body text.
			tuples.put("pageText", pageText.toString().trim());
		    }
		    readingPageText = false;
		    pageText = null;
		} else if (readingPageText) {
		    // Reading the page body. readLine() strips newlines,
		    // so reintroduce them between accumulated lines.
		    pageText.append(line);
		    pageText.append("\n");
		} else {
		    // Rest of the page dump's metadata.
		    // QTODO: nutch's text dump output is problematic —
		    // strange characters end up in the stream, making end of
		    // metadata (or even end of line) hard to detect.
		    int sepIndex = line.indexOf(":");
		    if (sepIndex != -1) {
			String k = line.substring(0, sepIndex);
			String v = line.substring(sepIndex + 1);
			if (k.startsWith("metadata")) {
			    k = k.substring("metadata".length());
			}
			tuples.put(k.trim(), v.trim());
		    } else if (DEBUG_MODE) {
			logger.error("No meta key for meta: " + line);
			logger.error(unparsedPageDump);
		    }
		}
	    }

	    // If the page had no body, still map "pageText" -> "" so
	    // getPageText() never returns null.
	    if (!tuples.containsKey("pageText")) {
		tuples.put("pageText", "");
	    }

	} catch (IOException ioe) {
	    logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
	}

	// START DEBUG
	debugTuples();
	// END DEBUG
    }

    /** Logs every key/value tuple of this page, when DEBUG_MODE is on. */
    public void debugTuples() {
	if (DEBUG_MODE) {
	    logger.debug("__________________________________________");
	    for (Map.Entry<String, String> entry : tuples.entrySet()) {
		logger.debug(entry.getKey() + " - " + entry.getValue());
	    }
	    logger.debug("__________________________________________");
	}
    }

    /** @return this page's URL, or null if the dump section was empty */
    public String getPageURL() {
	return tuples.get("pageURL");
    }

    /** @return the page's body text; "" when the page had no body */
    public String getPageText() {
	return tuples.get("pageText");
    }

    /*
      Dr Nichols suggested storing timestamp and char encoding.
      Not sure which timestamp or encoding he meant, but storing 2 of several
      timestamps and selecting original character encoding (presumably the char
      encoding of the page) out of 2 pieces of char encoding metadata to store.
    */

    /**
     * @return the page's modified time metadata (presumably the webpage's
     *         last-mod time — TODO confirm); "" when unset or "0", since zero
     *         would otherwise be assumed to be epoch rather than unset
     */
    public String getModifiedTime() {
	String time = tuples.get("modifiedTime");
	return (time == null || time.equals("0")) ? "" : time;
    }

    /**
     * @return the page's fetch time metadata (presumably the nutch crawl
     *         time — TODO confirm); "" when unset or "0", since zero would
     *         otherwise be assumed to be epoch rather than unset
     */
    public String getFetchTime() {
	String time = tuples.get("fetchTime");
	return (time == null || time.equals("0")) ? "" : time;
    }

    /**
     * @return the "OriginalCharEncoding" metadata (presumably the web page's
     *         char-encoding — TODO confirm), or null if absent
     */
    public String getOriginalCharEncoding() {
	return tuples.get("OriginalCharEncoding");
    }

    /** @return the value stored for {@code key}, or null if absent */
    public String get(String key) {
	return tuples.get(key);
    }

    /** Stores (or overwrites) a key/value tuple for this page. */
    public void add(String key, String value) {
	tuples.put(key, value);
    }

    /**
     * IMPORTANT: This method deletes the data stored in this TextDumpPage
     * object after converting relevant fields and parameters to a
     * WebpageInfo object.
     *
     * @return the newly built WebpageInfo; this object's tuples are cleared
     */
    // Raw ArrayList parameters kept as-is for caller compatibility: the
    // sentence element type is declared elsewhere in the project.
    public WebpageInfo convertStoredDataToWebpageInfo(long webpageID, String siteID /*int websiteID*/,
						      boolean isMRI, int totalSentences,
						      ArrayList singleSentences, ArrayList overlappingSentences) {
	// Capture the important (meta)data before clearing the map.
	String pageText = getPageText();
	String pageURL = getPageURL();
	String charEncoding = getOriginalCharEncoding();
	String modifiedTime = getModifiedTime();
	String fetchTime = getFetchTime();

	WebpageInfo webpage = new WebpageInfo(webpageID, siteID/*websiteID,*/,
					      pageText, pageURL, isMRI, totalSentences,
					      charEncoding, modifiedTime, fetchTime,
					      singleSentences, overlappingSentences);
	tuples.clear();

	return webpage;
    }
}