Context Navigation

TextDumpPage.java@ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago

1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved the getDomainFromURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. The MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.

File size: 5.3 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.HashMap;
5	import java.util.Map;
6
7	import org.apache.log4j.Logger;
8
9
10	public class TextDumpPage {
11	private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13	public static final String TEXT_START_MARKER="text:start:";
14	public static final String TEXT_END_MARKER="text:end:";
15
16	private Map<String, String> tuples;
17
18	public TextDumpPage(String siteID, String unparsedPageDump) {
19	tuples = new HashMap<String, String>();
20
21	try (
22	BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23	) {
24
25	String line = reader.readLine(); // should have at least first line
26
27	// first line always has a "key:" somewhere after the pageURL
28	int endIndex = line.indexOf("key:");
29
30	String pageURL = line.substring(0, endIndex);
31	//String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34	tuples.put("pageURL", pageURL.trim());
35
36	//if(endIndex != -1) {
37	String key = line.substring(endIndex);
38	tuples.put("key", key.trim());
39	//} else {
40	//logger.debug("@@@@ no key for pageURL: " + pageURL);
41	//}
42	/*
43	if(pageURL.contains(TEXT_END_MARKER)) {
44	logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45	logger.debug("+++++++++");
46	logger.debug(unparsedPageDump);
47	logger.debug("+++++++++");
48	}
49	*/
50
51	boolean readingPageText = false;
52	StringBuilder pageText = null;
53
54	// continue reading all other tuples for this page, if any
55	while((line = reader.readLine()) != null) {
56	line = line.trim();
57
58	// check if we're dealing with metadata or start/end of page's text body
59	// or actual text body
60
61	if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
62	pageText = new StringBuilder();
63	readingPageText = true;
64	}
65	else if(line.equals(TEXT_END_MARKER)) {
66	// finished with a page body
67	// Remove any FINAL artificial newline we introduced to a page's body text
68	tuples.put("pageText", pageText.toString().trim());
69	readingPageText = false;
70	pageText = null;
71	}
72	else {
73	if(readingPageText) { // So we're reading in the page text
74	pageText.append(line);
75	pageText.append("\n"); // there are no newlines within pageText
76	// but if there were newlines, add them back here as readLine() removes them
77	}
78	else { // dealing with the rest of the page dump's metadata
79	// QTODO: nutch's text dump output is problematic
80	// strange characters are in the stream and end up here
81	// and can't detect end of metadata or even end of line.
82	endIndex = line.indexOf(":");
83	if(endIndex != -1) {
84	String k = line.substring(0, endIndex);
85	String v = line.substring(endIndex+1);
86	if(k.startsWith("metadata")) {
87	k = k.substring("metadata".length());
88	}
89
90	tuples.put(k.trim(), v.trim());
91	} else {
92	if(NutchTextDumpProcessor.DEBUG_MODE) {
93	logger.error("No meta key for meta: " + line);
94	logger.error(unparsedPageDump);
95	}
96	}
97	}
98	}
99
100	}
101
102	// If the page had no pageText, add a "pageText" -> "" mapping
103	if(!tuples.containsKey("pageText")) {
104	tuples.put("pageText", "");
105	}
106
107
108	} catch (IOException ioe) {
109	logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
110	}
111
112
113	// START DEBUG
114	debugTuples();
115	// END DEBUG
116
117	}
118
119	public void debugTuples() {
120	if(NutchTextDumpProcessor.DEBUG_MODE) {
121	logger.debug("__________________________________________");
122	for(Map.Entry<String, String> entry : tuples.entrySet()) {
123	String key = entry.getKey();
124	String value = entry.getValue();
125	logger.debug(key + " - " + value);
126	}
127	logger.debug("__________________________________________");
128	}
129	}
130
131
132	public String getPageURL() {
133	return tuples.get("pageURL");
134	}
135
136	public String getPageText() {
137	return tuples.get("pageText");
138	}
139
140	/* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
141	or encoding he meant, but storing 2 of several timestamps and selecting
142	original character encoding (presumably the char encoding of the page) out of 2
143	pieces of char encoding metadata to store. */
144	public String getModifiedTime() {
145	// is this the webpage's last mod time?
146	String time = tuples.get("modifiedTime");
147	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
148	return time;
149	}
150	public String getFetchTime() {
151	// is this the nutch crawl time
152	String time = tuples.get("fetchTime");
153	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
154	return time;
155
156	}
157	public String getOriginalCharEncoding() {
158	// is this the web page's char-encoding?
159	return tuples.get("OriginalCharEncoding");
160	}
161
162	public String get(String key) {
163	return tuples.get(key);
164	}
165
166	public void add(String key, String value) {
167	tuples.put(key, value);
168	}
169
170	public void addMRILanguageStatus(boolean status) {
171	if(status) {
172	tuples.put("isMRI", "true");
173	} else {
174	tuples.put("isMRI", "false");
175	}
176	}
177
178	public boolean getMRILanguageStatus() {
179	String value = tuples.get("isMRI");
180	if(value == null) {
181	return false;
182	}
183	if(value.equals("true")) {
184	return true;
185	}
186	else {
187	return false;
188	}
189
190	}
191
192	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33623

Download in other formats: