source: other-projects/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33652

Last change on this file since 33652 was 33652, checked in by ak19, 4 years ago

Introducing morphia subpackage

File size: 6.4 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.ArrayList;
5import java.util.HashMap;
6import java.util.Map;
7
8import org.apache.log4j.Logger;
9
10import org.greenstone.atea.morphia.*;
11
12public class TextDumpPage {
13 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
14
15 public static final String TEXT_START_MARKER="text:start:";
16 public static final String TEXT_END_MARKER="text:end:";
17
18 private Map<String, String> tuples;
19
20 private boolean isMRI = false;
21
22 boolean DEBUG_MODE = false;
23
24 public TextDumpPage(String siteID, String unparsedPageDump) {
25 tuples = new HashMap<String, String>();
26
27 try (
28 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
29 ) {
30
31 String line = reader.readLine(); // should have at least first line
32
33 // first line always has a "key:" somewhere after the pageURL
34 int endIndex = line.indexOf("key:");
35
36 String pageURL = line.substring(0, endIndex);
37 //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
38
39
40 tuples.put("pageURL", pageURL.trim());
41
42 //if(endIndex != -1) {
43 String key = line.substring(endIndex);
44 tuples.put("key", key.trim());
45 //} else {
46 //logger.debug("@@@@ no key for pageURL: " + pageURL);
47 //}
48 /*
49 if(pageURL.contains(TEXT_END_MARKER)) {
50 logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
51 logger.debug("+++++++++");
52 logger.debug(unparsedPageDump);
53 logger.debug("+++++++++");
54 }
55 */
56
57 boolean readingPageText = false;
58 StringBuilder pageText = null;
59
60 // continue reading all other tuples for this page, if any
61 while((line = reader.readLine()) != null) {
62 line = line.trim();
63
64 // check if we're dealing with metadata or start/end of page's text body
65 // or actual text body
66
67 if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
68 pageText = new StringBuilder();
69 readingPageText = true;
70 }
71 else if(line.equals(TEXT_END_MARKER)) {
72 // finished with a page body
73 // Remove any FINAL artificial newline we introduced to a page's body text
74 tuples.put("pageText", pageText.toString().trim());
75 readingPageText = false;
76 pageText = null;
77 }
78 else {
79 if(readingPageText) { // So we're reading in the page text
80 pageText.append(line);
81 pageText.append("\n"); // there are no newlines within pageText
82 // but if there were newlines, add them back here as readLine() removes them
83 }
84 else { // dealing with the rest of the page dump's metadata
85 // QTODO: nutch's text dump output is problematic
86 // strange characters are in the stream and end up here
87 // and can't detect end of metadata or even end of line.
88 endIndex = line.indexOf(":");
89 if(endIndex != -1) {
90 String k = line.substring(0, endIndex);
91 String v = line.substring(endIndex+1);
92 if(k.startsWith("metadata")) {
93 k = k.substring("metadata".length());
94 }
95
96 tuples.put(k.trim(), v.trim());
97 } else {
98 if(DEBUG_MODE) {
99 logger.error("No meta key for meta: " + line);
100 logger.error(unparsedPageDump);
101 }
102 }
103 }
104 }
105
106 }
107
108 // If the page had no pageText, add a "pageText" -> "" mapping
109 if(!tuples.containsKey("pageText")) {
110 tuples.put("pageText", "");
111 }
112
113
114 } catch (IOException ioe) {
115 logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
116 }
117
118
119 // START DEBUG
120 debugTuples();
121 // END DEBUG
122
123 }
124
125 public void debugTuples() {
126 if(DEBUG_MODE) {
127 logger.debug("__________________________________________");
128 for(Map.Entry<String, String> entry : tuples.entrySet()) {
129 String key = entry.getKey();
130 String value = entry.getValue();
131 logger.debug(key + " - " + value);
132 }
133 logger.debug("__________________________________________");
134 }
135 }
136
137
138 public String getPageURL() {
139 return tuples.get("pageURL");
140 }
141
142 public String getPageText() {
143 return tuples.get("pageText");
144 }
145
146 /* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
147 or encoding he meant, but storing 2 of several timestamps and selecting
148 original character encoding (presumably the char encoding of the page) out of 2
149 pieces of char encoding metadata to store. */
150 public String getModifiedTime() {
151 // is this the webpage's last mod time?
152 String time = tuples.get("modifiedTime");
153 time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
154 return time;
155 }
156 public String getFetchTime() {
157 // is this the nutch crawl time
158 String time = tuples.get("fetchTime");
159 time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
160 return time;
161
162 }
163 public String getOriginalCharEncoding() {
164 // is this the web page's char-encoding?
165 return tuples.get("OriginalCharEncoding");
166 }
167
168 public String get(String key) {
169 return tuples.get(key);
170 }
171
172 public void add(String key, String value) {
173 tuples.put(key, value);
174 }
175
176 /**
177 * IMPORTANT: This method deletes the data stored in this TextDumpPage object
178 * after converting relevant fields and parameters to a WebpageInfo object
179 */
180 public WebpageInfo convertStoredDataToWebpageInfo(
181 long webpageID, String siteID /*int websiteID*/, boolean isMRI, int totalSentences,
182 ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
183 {
184 // clear the map, after storing the important (meta)data
185 String pageText = getPageText();
186 String pageURL = getPageURL();
187 String charEncoding = getOriginalCharEncoding();
188 String modifiedTime = getModifiedTime();
189 String fetchTime = getFetchTime();
190
191 WebpageInfo webpage = new WebpageInfo(webpageID, siteID/*websiteID,*/,
192 pageText, pageURL, isMRI, totalSentences,
193 charEncoding, modifiedTime, fetchTime,
194 singleSentences, overlappingSentences);
195
196 tuples.clear();
197
198 return webpage;
199 }
200
201
202
203 /*
204 public void addMRILanguageStatus(boolean status) {
205 if(status) {
206 tuples.put("isMRI", "true");
207 } else {
208 tuples.put("isMRI", "false");
209 }
210 }
211
212 public boolean getMRILanguageStatus() {
213 String value = tuples.get("isMRI");
214 if(value == null) {
215 return false;
216 }
217 if(value.equals("true")) {
218 return true;
219 }
220 else {
221 return false;
222 }
223
224 }
225 */
226}
Note: See TracBrowser for help on using the repository browser.