Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33582

Last change on this file since 33582 was 33582, checked in by ak19, 5 years ago
NutchTextDumpProcessor prints each crawled site's stats: the number of webpages per crawled site and how many of those were detected by OpenNLP as being in Maori (mri). This required making a reusable method in CCWETProcessor public and static.
File size: 4.9 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.HashMap;
5	import java.util.Map;
6
7	import org.apache.log4j.Logger;
8
9
10	public class TextDumpPage {
11	private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13	public static final String TEXT_START_MARKER="text:start:";
14	public static final String TEXT_END_MARKER="text:end:";
15
16	private Map<String, String> tuples;
17
18	public TextDumpPage(String siteID, String unparsedPageDump) {
19	tuples = new HashMap<String, String>();
20
21	try (
22	BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23	) {
24
25	String line = reader.readLine(); // should have at least first line
26
27	// first line always has a "key:" somewhere after the pageURL
28	int endIndex = line.indexOf("key:");
29
30	String pageURL = line.substring(0, endIndex);
31	//String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34	tuples.put("pageURL", pageURL.trim());
35
36	//if(endIndex != -1) {
37	String key = line.substring(endIndex);
38	tuples.put("key", key.trim());
39	//} else {
40	//debug("@@@@ no key for pageURL: " + pageURL);
41	//}
42	/*
43	if(pageURL.contains(TEXT_END_MARKER)) {
44	debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45	debug("+++++++++");
46	debug(unparsedPageDump);
47	debug("+++++++++");
48	}
49	*/
50
51	boolean readingPageText = false;
52	StringBuilder pageText = null;
53
54	// continue reading all other tuples for this page, if any
55	while((line = reader.readLine()) != null) {
56	line = line.trim();
57
58	// check if we're dealing with metadata or start/end of page's text body
59	// or actual text body
60
61	if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
62	pageText = new StringBuilder();
63	readingPageText = true;
64	}
65	else if(line.equals(TEXT_END_MARKER)) {
66	// finished with a page body
67	// Remove any FINAL artificial newline we introduced to a page's body text
68	tuples.put("pageText", pageText.toString().trim());
69	readingPageText = false;
70	pageText = null;
71	}
72	else {
73	if(readingPageText) { // So we're reading in the page text
74	pageText.append(line);
75	pageText.append("\n"); // there are no newlines within pageText
76	// but if there were newlines, add them back here as readLine() removes them
77	}
78	else { // dealing with the rest of the page dump's metadata
79	// QTODO: nutch's text dump output is problematic
80	// strange characters are in the stream and end up here
81	// and can't detect end of metadata or even end of line.
82	endIndex = line.indexOf(":");
83	if(endIndex != -1) {
84	String k = line.substring(0, endIndex);
85	String v = line.substring(endIndex+1);
86	tuples.put(k.trim(), v.trim());
87	} else {
88	if(NutchTextDumpProcessor.DEBUG_MODE) {
89	error("No meta key for meta: " + line);
90	error(unparsedPageDump);
91	}
92	}
93	}
94	}
95
96	}
97
98	// If the page had no pageText, add a "pageText" -> "" mapping
99	if(!tuples.containsKey("pageText")) {
100	tuples.put("pageText", "");
101	}
102
103
104	} catch (IOException ioe) {
105	error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
106	}
107
108
109	// START DEBUG
110	debugTuples();
111	// END DEBUG
112
113	}
114
115	public void debugTuples() {
116	if(NutchTextDumpProcessor.DEBUG_MODE) {
117	debug("__________________________________________");
118	for(Map.Entry<String, String> entry : tuples.entrySet()) {
119	String key = entry.getKey();
120	String value = entry.getValue();
121	debug(key + " - " + value);
122	}
123	debug("__________________________________________");
124	}
125	}
126
127
128	public String getPageURL() {
129	return tuples.get("pageURL");
130	}
131
132	public String getPageText() {
133	return tuples.get("pageText");
134	}
135
136	public String get(String key) {
137	return tuples.get(key);
138	}
139
140	public void add(String key, String value) {
141	tuples.put(key, value);
142	}
143
144	public void addMRILanguageStatus(boolean status) {
145	if(status) {
146	tuples.put("isMRI", "true");
147	} else {
148	tuples.put("isMRI", "false");
149	}
150	}
151
152	public boolean getMRILanguageStatus() {
153	String value = tuples.get("isMRI");
154	if(value == null) {
155	return false;
156	}
157	if(value.equals("true")) {
158	return true;
159	}
160	else {
161	return false;
162	}
163
164	}
165
166	// --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
167	public static void info(String msg) {
168	System.err.println(msg);
169	logger.info(msg);
170	}
171	public static void debug(String msg) {
172	System.err.println(msg);
173	logger.debug(msg);
174	}
175	public static void warn(String msg) {
176	System.err.println(msg);
177	logger.warn(msg);
178	}
179	public static void error(String msg) {
180	System.err.println(msg);
181	logger.error(msg);
182	}
183	public static void error(String msg, Exception e) {
184	logger.error(msg, e);
185	System.err.println("\n"+msg);
186	e.printStackTrace();
187	}
188
189	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: