source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess that now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, even though authentication should be working, since I'm following the online examples for using the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up. But what if someone else one day wants to run this against a mongodb where authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.
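
An "Unauthorized" failure on listCollections() right after a seemingly successful connect is commonly an authSource mismatch. Below is a minimal sketch of credential-based connection against the 3.x MongoDB Java driver; the host, user, password and database names are placeholders, not the project's actual values:

import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoDatabase;

public class MongoAuthCheck {
    public static void main(String[] args) {
        // The second argument to createCredential() must be the database the
        // user account was *defined* in (the authSource, often "admin"), not
        // necessarily the database being queried; getting this wrong produces
        // an Unauthorized error on commands like listCollections().
        MongoCredential credential = MongoCredential.createCredential(
            "USERNAME", "admin", "PASSWORD".toCharArray());

        try (MongoClient client = new MongoClient(
                new ServerAddress("mongodb.example.org", 27017),
                credential, MongoClientOptions.builder().build())) {
            MongoDatabase database = client.getDatabase("someCrawlDB"); // placeholder db name
            for (String name : database.listCollectionNames()) { // fails here if auth is wrong
                System.out.println(name);
            }
        }
    }
}

Note that the driver authenticates lazily: constructing the MongoClient does not contact the server, so the error only surfaces at the first actual command (here listCollectionNames()), which matches the "supposedly connects, then fails" behaviour described above.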

File size: 6.3 KB
package org.greenstone.atea;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;

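/**
 * Parses one page of a Nutch text dump into a map of String tuples.
 * The first line of a page dump has the form "<pageURL> key:<key>".
 * Subsequent lines are "<name>:<value>" metadata pairs (a "metadata"
 * prefix on the name, if present, is stripped), except for the page's
 * body text, which appears between the TEXT_START_MARKER and
 * TEXT_END_MARKER lines.
 */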
public class TextDumpPage {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());

    public static final String TEXT_START_MARKER = "text:start:";
    public static final String TEXT_END_MARKER = "text:end:";

    private Map<String, String> tuples;

    private boolean isMRI = false; // note: currently unused; language status travels as a parameter instead

    boolean DEBUG_MODE = false;
    public TextDumpPage(String siteID, String unparsedPageDump) { // note: siteID is currently unused

        tuples = new HashMap<String, String>();

        try (
            BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
        ) {
            String line = reader.readLine(); // a page dump has at least this first line

            // The first line has the form "<pageURL> key:<key>".
            // Guard against a missing "key:" marker, in which case the whole line is the pageURL.
            int endIndex = line.indexOf("key:");
            String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
            tuples.put("pageURL", pageURL.trim());

            if (endIndex != -1) {
                tuples.put("key", line.substring(endIndex).trim());
            }

            boolean readingPageText = false;
            StringBuilder pageText = null;

            // continue reading the page's remaining tuples, if any
            while ((line = reader.readLine()) != null) {
                line = line.trim();

                // Each line is either a start/end marker for the page's text body,
                // a line of that body, or a metadata tuple.
                if (line.equals(TEXT_START_MARKER)) { // start of the page body text
                    pageText = new StringBuilder();
                    readingPageText = true;
                }
                else if (line.equals(TEXT_END_MARKER)) { // finished with a page body
                    // trim() removes the final artificial newline appended below
                    tuples.put("pageText", pageText.toString().trim());
                    readingPageText = false;
                    pageText = null;
                }
                else if (readingPageText) { // a line of the page's body text
                    pageText.append(line);
                    // readLine() strips newlines, so restore one between lines
                    pageText.append("\n");
                }
                else { // the rest of the page dump's metadata, as "name:value" lines
                    // QTODO: Nutch's text dump output is problematic:
                    // strange characters appear in the stream and end up here,
                    // so the end of the metadata (or even of a line) can't be
                    // reliably detected.
                    endIndex = line.indexOf(":");
                    if (endIndex != -1) {
                        String k = line.substring(0, endIndex);
                        String v = line.substring(endIndex + 1);
                        if (k.startsWith("metadata")) {
                            k = k.substring("metadata".length());
                        }
                        tuples.put(k.trim(), v.trim());
                    } else if (DEBUG_MODE) {
                        logger.error("No meta key for meta: " + line);
                        logger.error(unparsedPageDump);
                    }
                }
            }

            // If the page had no pageText, add a "pageText" -> "" mapping
            if (!tuples.containsKey("pageText")) {
                tuples.put("pageText", "");
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in the text dump of a page.", ioe);
        }

        debugTuples(); // only logs when DEBUG_MODE is true
    }

    public void debugTuples() {
        if (DEBUG_MODE) {
            logger.debug("__________________________________________");
            for (Map.Entry<String, String> entry : tuples.entrySet()) {
                logger.debug(entry.getKey() + " - " + entry.getValue());
            }
            logger.debug("__________________________________________");
        }
    }


    public String getPageURL() {
        return tuples.get("pageURL");
    }

    public String getPageText() {
        return tuples.get("pageText");
    }

    /* Dr Nichols suggested storing a timestamp and character encoding. It's not certain
       which timestamp or which encoding he meant, so two of the several timestamps are
       stored, and "OriginalCharEncoding" (presumably the char encoding of the page itself)
       is chosen out of the two pieces of char-encoding metadata available. */
    public String getModifiedTime() {
        // is this the webpage's last modified time?
        String time = tuples.get("modifiedTime");
        // "0" (or absent) means unset; return "" so it isn't read as the epoch
        return (time == null || time.equals("0")) ? "" : time;
    }

    public String getFetchTime() {
        // is this the nutch crawl time?
        String time = tuples.get("fetchTime");
        // "0" (or absent) means unset; return "" so it isn't read as the epoch
        return (time == null || time.equals("0")) ? "" : time;
    }

    public String getOriginalCharEncoding() {
        // is this the web page's char-encoding?
        return tuples.get("OriginalCharEncoding");
    }

    public String get(String key) {
        return tuples.get(key);
    }

    public void add(String key, String value) {
        tuples.put(key, value);
    }

    /**
     * IMPORTANT: This method deletes the data stored in this TextDumpPage object
     * after converting the relevant fields and parameters to a WebpageInfo object.
     */
    public WebpageInfo convertStoredDataToWebpageInfo(
        long webpageID, int websiteID, boolean isMRI, int totalSentences,
        ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
    {
        // grab the important (meta)data before clearing the map
        String pageText = getPageText();
        String pageURL = getPageURL();
        String charEncoding = getOriginalCharEncoding();
        String modifiedTime = getModifiedTime();
        String fetchTime = getFetchTime();

        WebpageInfo webpage = new WebpageInfo(webpageID, websiteID,
            pageText, pageURL, isMRI, totalSentences,
            charEncoding, modifiedTime, fetchTime,
            singleSentences, overlappingSentences);

        tuples.clear();

        return webpage;
    }


    /*
    public void addMRILanguageStatus(boolean status) {
        if (status) {
            tuples.put("isMRI", "true");
        } else {
            tuples.put("isMRI", "false");
        }
    }

    public boolean getMRILanguageStatus() {
        String value = tuples.get("isMRI");
        return (value != null) && value.equals("true");
    }
    */
}
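
For orientation, a minimal usage sketch. The dump string, metadata names and siteID below are hypothetical, shaped only to match the format the parser above expects; real input comes from Nutch's text dump output:

public class TextDumpPageDemo {
    public static void main(String[] args) {
        String dump =
              "https://example.org/mi/ key:org/example.org/mi/\n"
            + "fetchTime:1562037600000\n"
            + "modifiedTime:0\n"
            + "metadataOriginalCharEncoding:utf-8\n"
            + "text:start:\n"
            + "He whārangi tauira tēnei.\n"
            + "text:end:\n";

        TextDumpPage page = new TextDumpPage("site00001", dump);
        System.out.println(page.getPageURL());              // https://example.org/mi/
        System.out.println(page.getFetchTime());            // 1562037600000
        System.out.println(page.getModifiedTime());         // "" (a "0" value means unset)
        System.out.println(page.getOriginalCharEncoding()); // utf-8 ("metadata" prefix stripped)
        System.out.println(page.getPageText());             // He whārangi tauira tēnei.
    }
}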