Context Navigation

TextDumpPage.java@ 33652

Last change on this file since 33652 was 33652, checked in by ak19, 4 years ago
Introducing morphia subpackage
File size: 6.4 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.ArrayList;
5	import java.util.HashMap;
6	import java.util.Map;
7
8	import org.apache.log4j.Logger;
9
10	import org.greenstone.atea.morphia.*;
11
12	public class TextDumpPage {
13	private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
14
15	public static final String TEXT_START_MARKER="text:start:";
16	public static final String TEXT_END_MARKER="text:end:";
17
18	private Map<String, String> tuples;
19
20	private boolean isMRI = false;
21
22	boolean DEBUG_MODE = false;
23
24	public TextDumpPage(String siteID, String unparsedPageDump) {
25	tuples = new HashMap<String, String>();
26
27	try (
28	BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
29	) {
30
31	String line = reader.readLine(); // should have at least first line
32
33	// first line always has a "key:" somewhere after the pageURL
34	int endIndex = line.indexOf("key:");
35
36	String pageURL = line.substring(0, endIndex);
37	//String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
38
39
40	tuples.put("pageURL", pageURL.trim());
41
42	//if(endIndex != -1) {
43	String key = line.substring(endIndex);
44	tuples.put("key", key.trim());
45	//} else {
46	//logger.debug("@@@@ no key for pageURL: " + pageURL);
47	//}
48	/*
49	if(pageURL.contains(TEXT_END_MARKER)) {
50	logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
51	logger.debug("+++++++++");
52	logger.debug(unparsedPageDump);
53	logger.debug("+++++++++");
54	}
55	*/
56
57	boolean readingPageText = false;
58	StringBuilder pageText = null;
59
60	// continue reading all other tuples for this page, if any
61	while((line = reader.readLine()) != null) {
62	line = line.trim();
63
64	// check if we're dealing with metadata or start/end of page's text body
65	// or actual text body
66
67	if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
68	pageText = new StringBuilder();
69	readingPageText = true;
70	}
71	else if(line.equals(TEXT_END_MARKER)) {
72	// finished with a page body
73	// Remove any FINAL artificial newline we introduced to a page's body text
74	tuples.put("pageText", pageText.toString().trim());
75	readingPageText = false;
76	pageText = null;
77	}
78	else {
79	if(readingPageText) { // So we're reading in the page text
80	pageText.append(line);
81	pageText.append("\n"); // there are no newlines within pageText
82	// but if there were newlines, add them back here as readLine() removes them
83	}
84	else { // dealing with the rest of the page dump's metadata
85	// QTODO: nutch's text dump output is problematic
86	// strange characters are in the stream and end up here
87	// and can't detect end of metadata or even end of line.
88	endIndex = line.indexOf(":");
89	if(endIndex != -1) {
90	String k = line.substring(0, endIndex);
91	String v = line.substring(endIndex+1);
92	if(k.startsWith("metadata")) {
93	k = k.substring("metadata".length());
94	}
95
96	tuples.put(k.trim(), v.trim());
97	} else {
98	if(DEBUG_MODE) {
99	logger.error("No meta key for meta: " + line);
100	logger.error(unparsedPageDump);
101	}
102	}
103	}
104	}
105
106	}
107
108	// If the page had no pageText, add a "pageText" -> "" mapping
109	if(!tuples.containsKey("pageText")) {
110	tuples.put("pageText", "");
111	}
112
113
114	} catch (IOException ioe) {
115	logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
116	}
117
118
119	// START DEBUG
120	debugTuples();
121	// END DEBUG
122
123	}
124
125	public void debugTuples() {
126	if(DEBUG_MODE) {
127	logger.debug("__________________________________________");
128	for(Map.Entry<String, String> entry : tuples.entrySet()) {
129	String key = entry.getKey();
130	String value = entry.getValue();
131	logger.debug(key + " - " + value);
132	}
133	logger.debug("__________________________________________");
134	}
135	}
136
137
138	public String getPageURL() {
139	return tuples.get("pageURL");
140	}
141
142	public String getPageText() {
143	return tuples.get("pageText");
144	}
145
146	/* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
147	or encoding he meant, but storing 2 of several timestamps and selecting
148	original character encoding (presumably the char encoding of the page) out of 2
149	pieces of char encoding metadata to store. */
150	public String getModifiedTime() {
151	// is this the webpage's last mod time?
152	String time = tuples.get("modifiedTime");
153	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
154	return time;
155	}
156	public String getFetchTime() {
157	// is this the nutch crawl time
158	String time = tuples.get("fetchTime");
159	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
160	return time;
161
162	}
163	public String getOriginalCharEncoding() {
164	// is this the web page's char-encoding?
165	return tuples.get("OriginalCharEncoding");
166	}
167
168	public String get(String key) {
169	return tuples.get(key);
170	}
171
172	public void add(String key, String value) {
173	tuples.put(key, value);
174	}
175
176	/**
177	* IMPORTANT: This method deletes the data stored in this TextDumpPage object
178	* after converting relevant fields and parameters to a WebpageInfo object
179	*/
180	public WebpageInfo convertStoredDataToWebpageInfo(
181	long webpageID, String siteID /int websiteID/, boolean isMRI, int totalSentences,
182	ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
183	{
184	// clear the map, after storing the important (meta)data
185	String pageText = getPageText();
186	String pageURL = getPageURL();
187	String charEncoding = getOriginalCharEncoding();
188	String modifiedTime = getModifiedTime();
189	String fetchTime = getFetchTime();
190
191	WebpageInfo webpage = new WebpageInfo(webpageID, siteID/websiteID,/,
192	pageText, pageURL, isMRI, totalSentences,
193	charEncoding, modifiedTime, fetchTime,
194	singleSentences, overlappingSentences);
195
196	tuples.clear();
197
198	return webpage;
199	}
200
201
202
203	/*
204	public void addMRILanguageStatus(boolean status) {
205	if(status) {
206	tuples.put("isMRI", "true");
207	} else {
208	tuples.put("isMRI", "false");
209	}
210	}
211
212	public boolean getMRILanguageStatus() {
213	String value = tuples.get("isMRI");
214	if(value == null) {
215	return false;
216	}
217	if(value.equals("true")) {
218	return true;
219	}
220	else {
221	return false;
222	}
223
224	}
225	*/
226	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33652

Download in other formats: