Context Navigation

TextDumpPage.java@ 33978

Last change on this file since 33978 was 33652, checked in by ak19, 5 years ago
Introducing morphia subpackage
File size: 6.4 KB

Rev	Line
[33576]	1	package org.greenstone.atea;
	2
	3	import java.io.*;
[33634]	4	import java.util.ArrayList;
[33576]	5	import java.util.HashMap;
	6	import java.util.Map;
	7
[33578]	8	import org.apache.log4j.Logger;
	9
[33652]	10	import org.greenstone.atea.morphia.*;
[33578]	11
[33576]	12	public class TextDumpPage {
[33578]	13	private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
	14
[33576]	15	public static final String TEXT_START_MARKER="text:start:";
	16	public static final String TEXT_END_MARKER="text:end:";
	17
[33578]	18	private Map<String, String> tuples;
[33634]	19
	20	private boolean isMRI = false;
	21
	22	boolean DEBUG_MODE = false;
[33576]	23
	24	public TextDumpPage(String siteID, String unparsedPageDump) {
[33578]	25	tuples = new HashMap<String, String>();
[33576]	26
	27	try (
	28	BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
	29	) {
	30
	31	String line = reader.readLine(); // should have at least first line
	32
	33	// first line always has a "key:" somewhere after the pageURL
	34	int endIndex = line.indexOf("key:");
[33579]	35
[33580]	36	String pageURL = line.substring(0, endIndex);
	37	//String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
[33576]	38
[33579]	39
[33578]	40	tuples.put("pageURL", pageURL.trim());
[33576]	41
[33580]	42	//if(endIndex != -1) {
	43	String key = line.substring(endIndex);
	44	tuples.put("key", key.trim());
	45	//} else {
[33615]	46	//logger.debug("@@@@ no key for pageURL: " + pageURL);
[33580]	47	//}
	48	/*
[33579]	49	if(pageURL.contains(TEXT_END_MARKER)) {
[33615]	50	logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
	51	logger.debug("+++++++++");
	52	logger.debug(unparsedPageDump);
	53	logger.debug("+++++++++");
[33579]	54	}
[33580]	55	*/
[33579]	56
[33576]	57	boolean readingPageText = false;
	58	StringBuilder pageText = null;
	59
	60	// continue reading all other tuples for this page, if any
	61	while((line = reader.readLine()) != null) {
[33580]	62	line = line.trim();
	63
	64	// check if we're dealing with metadata or start/end of page's text body
	65	// or actual text body
[33576]	66
[33580]	67	if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
	68	pageText = new StringBuilder();
	69	readingPageText = true;
[33576]	70	}
[33580]	71	else if(line.equals(TEXT_END_MARKER)) {
	72	// finished with a page body
	73	// Remove any FINAL artificial newline we introduced to a page's body text
	74	tuples.put("pageText", pageText.toString().trim());
	75	readingPageText = false;
	76	pageText = null;
	77	}
	78	else {
	79	if(readingPageText) { // So we're reading in the page text
[33576]	80	pageText.append(line);
	81	pageText.append("\n"); // there are no newlines within pageText
	82	// but if there were newlines, add them back here as readLine() removes them
[33580]	83	}
	84	else { // dealing with the rest of the page dump's metadata
[33582]	85	// QTODO: nutch's text dump output is problematic
	86	// strange characters are in the stream and end up here
	87	// and can't detect end of metadata or even end of line.
[33580]	88	endIndex = line.indexOf(":");
	89	if(endIndex != -1) {
	90	String k = line.substring(0, endIndex);
	91	String v = line.substring(endIndex+1);
[33623]	92	if(k.startsWith("metadata")) {
	93	k = k.substring("metadata".length());
	94	}
	95
[33580]	96	tuples.put(k.trim(), v.trim());
	97	} else {
[33634]	98	if(DEBUG_MODE) {
[33615]	99	logger.error("No meta key for meta: " + line);
	100	logger.error(unparsedPageDump);
[33582]	101	}
[33580]	102	}
	103	}
	104	}
[33576]	105
	106	}
	107
	108	// If the page had no pageText, add a "pageText" -> "" mapping
	109	if(!tuples.containsKey("pageText")) {
[33578]	110	tuples.put("pageText", "");
[33576]	111	}
[33579]	112
[33576]	113
	114	} catch (IOException ioe) {
[33615]	115	logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
[33576]	116	}
[33579]	117
[33580]	118
[33579]	119	// START DEBUG
[33582]	120	debugTuples();
[33579]	121	// END DEBUG
[33580]	122
[33576]	123	}
	124
[33582]	125	public void debugTuples() {
[33634]	126	if(DEBUG_MODE) {
[33615]	127	logger.debug("__________________________________________");
[33582]	128	for(Map.Entry<String, String> entry : tuples.entrySet()) {
	129	String key = entry.getKey();
	130	String value = entry.getValue();
[33615]	131	logger.debug(key + " - " + value);
[33582]	132	}
[33615]	133	logger.debug("__________________________________________");
[33582]	134	}
	135	}
[33576]	136
[33582]	137
[33576]	138	public String getPageURL() {
[33582]	139	return tuples.get("pageURL");
[33576]	140	}
	141
	142	public String getPageText() {
	143	return tuples.get("pageText");
	144	}
	145
[33623]	146	/* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
	147	or encoding he meant, but storing 2 of several timestamps and selecting
	148	original character encoding (presumably the char encoding of the page) out of 2
	149	pieces of char encoding metadata to store. */
	150	public String getModifiedTime() {
	151	// is this the webpage's last mod time?
	152	String time = tuples.get("modifiedTime");
	153	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
	154	return time;
	155	}
	156	public String getFetchTime() {
	157	// is this the nutch crawl time
	158	String time = tuples.get("fetchTime");
	159	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
	160	return time;
	161
	162	}
	163	public String getOriginalCharEncoding() {
	164	// is this the web page's char-encoding?
	165	return tuples.get("OriginalCharEncoding");
	166	}
	167
[33576]	168	public String get(String key) {
	169	return tuples.get(key);
	170	}
[33578]	171
[33582]	172	public void add(String key, String value) {
	173	tuples.put(key, value);
	174	}
	175
[33634]	176	/**
	177	* IMPORTANT: This method deletes the data stored in this TextDumpPage object
	178	* after converting relevant fields and parameters to a WebpageInfo object
	179	*/
	180	public WebpageInfo convertStoredDataToWebpageInfo(
[33652]	181	long webpageID, String siteID /int websiteID/, boolean isMRI, int totalSentences,
[33634]	182	ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
	183	{
	184	// clear the map, after storing the important (meta)data
	185	String pageText = getPageText();
	186	String pageURL = getPageURL();
	187	String charEncoding = getOriginalCharEncoding();
	188	String modifiedTime = getModifiedTime();
	189	String fetchTime = getFetchTime();
	190
[33652]	191	WebpageInfo webpage = new WebpageInfo(webpageID, siteID/websiteID,/,
[33634]	192	pageText, pageURL, isMRI, totalSentences,
	193	charEncoding, modifiedTime, fetchTime,
	194	singleSentences, overlappingSentences);
	195
	196	tuples.clear();
	197
	198	return webpage;
	199	}
	200
	201
	202
	203	/*
[33582]	204	public void addMRILanguageStatus(boolean status) {
	205	if(status) {
	206	tuples.put("isMRI", "true");
	207	} else {
	208	tuples.put("isMRI", "false");
	209	}
	210	}
	211
	212	public boolean getMRILanguageStatus() {
	213	String value = tuples.get("isMRI");
	214	if(value == null) {
	215	return false;
	216	}
	217	if(value.equals("true")) {
	218	return true;
	219	}
	220	else {
	221	return false;
	222	}
	223
	224	}
[33634]	225	*/
[33576]	226	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33978

Download in other formats: