Context Navigation

TextDumpPage.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess, which now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the examples online to use the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a MongoDB where authentication is set up (the way TSG set it up for the MongoDB they gave me access to)? Then it still wouldn't work.

File size: 6.3 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.ArrayList;
5	import java.util.HashMap;
6	import java.util.Map;
7
8	import org.apache.log4j.Logger;
9
10
11	public class TextDumpPage {
12	private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
13
14	public static final String TEXT_START_MARKER="text:start:";
15	public static final String TEXT_END_MARKER="text:end:";
16
17	private Map<String, String> tuples;
18
19	private boolean isMRI = false;
20
21	boolean DEBUG_MODE = false;
22
23	public TextDumpPage(String siteID, String unparsedPageDump) {
24	tuples = new HashMap<String, String>();
25
26	try (
27	BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
28	) {
29
30	String line = reader.readLine(); // should have at least first line
31
32	// first line always has a "key:" somewhere after the pageURL
33	int endIndex = line.indexOf("key:");
34
35	String pageURL = line.substring(0, endIndex);
36	//String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
37
38
39	tuples.put("pageURL", pageURL.trim());
40
41	//if(endIndex != -1) {
42	String key = line.substring(endIndex);
43	tuples.put("key", key.trim());
44	//} else {
45	//logger.debug("@@@@ no key for pageURL: " + pageURL);
46	//}
47	/*
48	if(pageURL.contains(TEXT_END_MARKER)) {
49	logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
50	logger.debug("+++++++++");
51	logger.debug(unparsedPageDump);
52	logger.debug("+++++++++");
53	}
54	*/
55
56	boolean readingPageText = false;
57	StringBuilder pageText = null;
58
59	// continue reading all other tuples for this page, if any
60	while((line = reader.readLine()) != null) {
61	line = line.trim();
62
63	// check if we're dealing with metadata or start/end of page's text body
64	// or actual text body
65
66	if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
67	pageText = new StringBuilder();
68	readingPageText = true;
69	}
70	else if(line.equals(TEXT_END_MARKER)) {
71	// finished with a page body
72	// Remove any FINAL artificial newline we introduced to a page's body text
73	tuples.put("pageText", pageText.toString().trim());
74	readingPageText = false;
75	pageText = null;
76	}
77	else {
78	if(readingPageText) { // So we're reading in the page text
79	pageText.append(line);
80	pageText.append("\n"); // there are no newlines within pageText
81	// but if there were newlines, add them back here as readLine() removes them
82	}
83	else { // dealing with the rest of the page dump's metadata
84	// QTODO: nutch's text dump output is problematic
85	// strange characters are in the stream and end up here
86	// and can't detect end of metadata or even end of line.
87	endIndex = line.indexOf(":");
88	if(endIndex != -1) {
89	String k = line.substring(0, endIndex);
90	String v = line.substring(endIndex+1);
91	if(k.startsWith("metadata")) {
92	k = k.substring("metadata".length());
93	}
94
95	tuples.put(k.trim(), v.trim());
96	} else {
97	if(DEBUG_MODE) {
98	logger.error("No meta key for meta: " + line);
99	logger.error(unparsedPageDump);
100	}
101	}
102	}
103	}
104
105	}
106
107	// If the page had no pageText, add a "pageText" -> "" mapping
108	if(!tuples.containsKey("pageText")) {
109	tuples.put("pageText", "");
110	}
111
112
113	} catch (IOException ioe) {
114	logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
115	}
116
117
118	// START DEBUG
119	debugTuples();
120	// END DEBUG
121
122	}
123
124	public void debugTuples() {
125	if(DEBUG_MODE) {
126	logger.debug("__________________________________________");
127	for(Map.Entry<String, String> entry : tuples.entrySet()) {
128	String key = entry.getKey();
129	String value = entry.getValue();
130	logger.debug(key + " - " + value);
131	}
132	logger.debug("__________________________________________");
133	}
134	}
135
136
137	public String getPageURL() {
138	return tuples.get("pageURL");
139	}
140
141	public String getPageText() {
142	return tuples.get("pageText");
143	}
144
145	/* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
146	or encoding he meant, but storing 2 of several timestamps and selecting
147	original character encoding (presumably the char encoding of the page) out of 2
148	pieces of char encoding metadata to store. */
149	public String getModifiedTime() {
150	// is this the webpage's last mod time?
151	String time = tuples.get("modifiedTime");
152	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
153	return time;
154	}
155	public String getFetchTime() {
156	// is this the nutch crawl time
157	String time = tuples.get("fetchTime");
158	time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
159	return time;
160
161	}
162	public String getOriginalCharEncoding() {
163	// is this the web page's char-encoding?
164	return tuples.get("OriginalCharEncoding");
165	}
166
167	public String get(String key) {
168	return tuples.get(key);
169	}
170
171	public void add(String key, String value) {
172	tuples.put(key, value);
173	}
174
175	/**
176	* IMPORTANT: This method deletes the data stored in this TextDumpPage object
177	* after converting relevant fields and parameters to a WebpageInfo object
178	*/
179	public WebpageInfo convertStoredDataToWebpageInfo(
180	long webpageID, int websiteID, boolean isMRI, int totalSentences,
181	ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
182	{
183	// clear the map, after storing the important (meta)data
184	String pageText = getPageText();
185	String pageURL = getPageURL();
186	String charEncoding = getOriginalCharEncoding();
187	String modifiedTime = getModifiedTime();
188	String fetchTime = getFetchTime();
189
190	WebpageInfo webpage = new WebpageInfo(webpageID, websiteID,
191	pageText, pageURL, isMRI, totalSentences,
192	charEncoding, modifiedTime, fetchTime,
193	singleSentences, overlappingSentences);
194
195	tuples.clear();
196
197	return webpage;
198	}
199
200
201
202	/*
203	public void addMRILanguageStatus(boolean status) {
204	if(status) {
205	tuples.put("isMRI", "true");
206	} else {
207	tuples.put("isMRI", "false");
208	}
209	}
210
211	public boolean getMRILanguageStatus() {
212	String value = tuples.get("isMRI");
213	if(value == null) {
214	return false;
215	}
216	if(value.equals("true")) {
217	return true;
218	}
219	else {
220	return false;
221	}
222
223	}
224	*/
225	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33634

Download in other formats: