package org.greenstone.atea;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;

import org.greenstone.atea.morphia.*;

/**
 * Parses a single page's section of a Nutch text-dump into a key/value map
 * ("tuples") of the page's metadata plus its body text, and exposes simple
 * accessors over that map.
 *
 * <p>Expected dump layout (one page): a first line of the form
 * {@code <pageURL> key:<key>}, followed by {@code meta:value} lines, with the
 * page body delimited by the {@link #TEXT_START_MARKER} and
 * {@link #TEXT_END_MARKER} lines.
 */
public class TextDumpPage {
    private static final Logger logger =
	Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());

    /** Line marking the start of a page's body text in the dump. */
    public static final String TEXT_START_MARKER = "text:start:";
    /** Line marking the end of a page's body text in the dump. */
    public static final String TEXT_END_MARKER = "text:end:";

    /** All parsed (meta)data for this page, keyed by metadata name. */
    private final Map<String, String> tuples;

    boolean DEBUG_MODE = false;

    /**
     * Parses one page's unparsed text-dump section into the tuples map.
     *
     * @param siteID           identifier of the site this page belongs to
     *                         (currently unused here; kept for callers)
     * @param unparsedPageDump the raw dump text for exactly one page
     */
    public TextDumpPage(String siteID, String unparsedPageDump) {
	tuples = new HashMap<>();

	try (
	     BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
	     ) {
	    // First line (if any) always contains the pageURL, optionally
	    // followed by "key:<key>" somewhere after it.
	    String line = reader.readLine();
	    if (line != null) {
		int endIndex = line.indexOf("key:");
		if (endIndex == -1) {
		    // No "key:" present — the whole line is the pageURL.
		    tuples.put("pageURL", line.trim());
		} else {
		    tuples.put("pageURL", line.substring(0, endIndex).trim());
		    tuples.put("key", line.substring(endIndex).trim());
		}
	    }

	    boolean readingPageText = false;
	    StringBuilder pageText = null;

	    // Continue reading all other tuples for this page, if any.
	    // Each line is either metadata, a start/end-of-body marker,
	    // or part of the body text itself.
	    while ((line = reader.readLine()) != null) {
		line = line.trim();

		if (line.equals(TEXT_START_MARKER)) {
		    // Start of the page body text.
		    pageText = new StringBuilder();
		    readingPageText = true;
		} else if (line.equals(TEXT_END_MARKER)) {
		    // End of the page body. Guard against a stray end-marker
		    // appearing before any start-marker (pageText still null).
		    if (pageText != null) {
			// trim() removes the final artificial newline we
			// introduced while accumulating the body text.
			tuples.put("pageText", pageText.toString().trim());
		    }
		    readingPageText = false;
		    pageText = null;
		} else if (readingPageText) {
		    // Reading the page body. readLine() strips newlines,
		    // so reintroduce them between accumulated lines.
		    pageText.append(line);
		    pageText.append("\n");
		} else {
		    // Rest of the page dump's metadata.
		    // QTODO: nutch's text dump output is problematic —
		    // strange characters end up in the stream, making end of
		    // metadata (or even end of line) hard to detect.
		    int sepIndex = line.indexOf(":");
		    if (sepIndex != -1) {
			String k = line.substring(0, sepIndex);
			String v = line.substring(sepIndex + 1);
			if (k.startsWith("metadata")) {
			    k = k.substring("metadata".length());
			}
			tuples.put(k.trim(), v.trim());
		    } else if (DEBUG_MODE) {
			logger.error("No meta key for meta: " + line);
			logger.error(unparsedPageDump);
		    }
		}
	    }

	    // If the page had no body, still map "pageText" -> "" so
	    // getPageText() never returns null.
	    if (!tuples.containsKey("pageText")) {
		tuples.put("pageText", "");
	    }

	} catch (IOException ioe) {
	    logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
	}

	// START DEBUG
	debugTuples();
	// END DEBUG
    }

    /** Logs every key/value tuple of this page, when DEBUG_MODE is on. */
    public void debugTuples() {
	if (DEBUG_MODE) {
	    logger.debug("__________________________________________");
	    for (Map.Entry<String, String> entry : tuples.entrySet()) {
		logger.debug(entry.getKey() + " - " + entry.getValue());
	    }
	    logger.debug("__________________________________________");
	}
    }

    /** @return this page's URL, or null if the dump section was empty */
    public String getPageURL() {
	return tuples.get("pageURL");
    }

    /** @return the page's body text; "" when the page had no body */
    public String getPageText() {
	return tuples.get("pageText");
    }

    /*
      Dr Nichols suggested storing timestamp and char encoding.
      Not sure which timestamp or encoding he meant, but storing 2 of several
      timestamps and selecting original character encoding (presumably the char
      encoding of the page) out of 2 pieces of char encoding metadata to store.
    */

    /**
     * @return the page's modified time metadata (presumably the webpage's
     *         last-mod time — TODO confirm); "" when unset or "0", since zero
     *         would otherwise be assumed to be epoch rather than unset
     */
    public String getModifiedTime() {
	String time = tuples.get("modifiedTime");
	return (time == null || time.equals("0")) ? "" : time;
    }

    /**
     * @return the page's fetch time metadata (presumably the nutch crawl
     *         time — TODO confirm); "" when unset or "0", since zero would
     *         otherwise be assumed to be epoch rather than unset
     */
    public String getFetchTime() {
	String time = tuples.get("fetchTime");
	return (time == null || time.equals("0")) ? "" : time;
    }

    /**
     * @return the "OriginalCharEncoding" metadata (presumably the web page's
     *         char-encoding — TODO confirm), or null if absent
     */
    public String getOriginalCharEncoding() {
	return tuples.get("OriginalCharEncoding");
    }

    /** @return the value stored for {@code key}, or null if absent */
    public String get(String key) {
	return tuples.get(key);
    }

    /** Stores (or overwrites) a key/value tuple for this page. */
    public void add(String key, String value) {
	tuples.put(key, value);
    }

    /**
     * IMPORTANT: This method deletes the data stored in this TextDumpPage
     * object after converting relevant fields and parameters to a
     * WebpageInfo object.
     *
     * @return the newly built WebpageInfo; this object's tuples are cleared
     */
    // Raw ArrayList parameters kept as-is for caller compatibility: the
    // sentence element type is declared elsewhere in the project.
    public WebpageInfo convertStoredDataToWebpageInfo(long webpageID, String siteID /*int websiteID*/,
						      boolean isMRI, int totalSentences,
						      ArrayList singleSentences, ArrayList overlappingSentences) {
	// Capture the important (meta)data before clearing the map.
	String pageText = getPageText();
	String pageURL = getPageURL();
	String charEncoding = getOriginalCharEncoding();
	String modifiedTime = getModifiedTime();
	String fetchTime = getFetchTime();

	WebpageInfo webpage = new WebpageInfo(webpageID, siteID/*websiteID,*/,
					      pageText, pageURL, isMRI, totalSentences,
					      charEncoding, modifiedTime, fetchTime,
					      singleSentences, overlappingSentences);
	tuples.clear();

	return webpage;
    }
}