Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33578

Last change on this file since 33578 was 33578, checked in by ak19, 5 years ago
Corrections for compiling the 2 new classes.
File size: 3.2 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.HashMap;
5	import java.util.Map;
6
7	import org.apache.log4j.Logger;
8
9
10	public class TextDumpPage {
11	private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13	public static final String TEXT_START_MARKER="text:start:";
14	public static final String TEXT_END_MARKER="text:end:";
15
16	private Map<String, String> tuples;
17
18	public TextDumpPage(String siteID, String unparsedPageDump) {
19	tuples = new HashMap<String, String>();
20
21	try (
22	BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23	) {
24
25	String line = reader.readLine(); // should have at least first line
26
27	// first line always has a "key:" somewhere after the pageURL
28	int endIndex = line.indexOf("key:");
29	String pageURL = line.substring(endIndex);
30
31	tuples.put("pageURL", pageURL.trim());
32
33	String key = line.substring(endIndex);
34	tuples.put("key", key.trim());
35
36	boolean readingPageText = false;
37	StringBuilder pageText = null;
38
39	// continue reading all other tuples for this page, if any
40	while((line = reader.readLine()) != null) {
41
42	if(!readingPageText) {
43	// check if we're dealing with metadata or start/end of pagetext
44	endIndex = line.indexOf(":");
45	if(endIndex != -1) { // dealing with the rest of the page dump's metadata
46	String k = line.substring(0, endIndex);
47	String v = line.substring(endIndex+1);
48	tuples.put(k.trim(), v.trim());
49	}
50
51	else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
52	pageText = new StringBuilder();
53	readingPageText = true;
54	}
55	}
56
57	else { // we're reading in the page text
58
59	if(line.equals(TEXT_END_MARKER)) {
60	// finished with a page body
61	// remove any FINAL artificial newline we introduced
62	tuples.put("pageText", pageText.toString().trim());
63	readingPageText = false;
64	pageText = null;
65	}
66	else {
67	pageText.append(line);
68	pageText.append("\n"); // there are no newlines within pageText
69	// but if there were newlines, add them back here as readLine() removes them
70	}
71
72	}
73	}
74
75	// If the page had no pageText, add a "pageText" -> "" mapping
76	if(!tuples.containsKey("pageText")) {
77	tuples.put("pageText", "");
78	}
79
80	} catch (IOException ioe) {
81	error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
82	}
83	}
84
85
86	public String getPageURL() {
87	return tuples.get("url");
88	}
89
90	public String getPageText() {
91	return tuples.get("pageText");
92	}
93
94	public String get(String key) {
95	return tuples.get(key);
96	}
97
98	// --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
99	public static void info(String msg) {
100	System.err.println(msg);
101	logger.info(msg);
102	}
103	public static void debug(String msg) {
104	System.err.println(msg);
105	logger.debug(msg);
106	}
107	public static void warn(String msg) {
108	System.err.println(msg);
109	logger.warn(msg);
110	}
111	public static void error(String msg) {
112	System.err.println(msg);
113	logger.error(msg);
114	}
115	public static void error(String msg, Exception e) {
116	logger.error(msg, e);
117	System.err.println("\n"+msg);
118	e.printStackTrace();
119	}
120
121	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: