source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved getDomainFromURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
File size: 5.3 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.HashMap;
5import java.util.Map;
6
7import org.apache.log4j.Logger;
8
9
10public class TextDumpPage {
11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13 public static final String TEXT_START_MARKER="text:start:";
14 public static final String TEXT_END_MARKER="text:end:";
15
16 private Map<String, String> tuples;
17
18 public TextDumpPage(String siteID, String unparsedPageDump) {
19 tuples = new HashMap<String, String>();
20
21 try (
22 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23 ) {
24
25 String line = reader.readLine(); // should have at least first line
26
27 // first line always has a "key:" somewhere after the pageURL
28 int endIndex = line.indexOf("key:");
29
30 String pageURL = line.substring(0, endIndex);
31 //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34 tuples.put("pageURL", pageURL.trim());
35
36 //if(endIndex != -1) {
37 String key = line.substring(endIndex);
38 tuples.put("key", key.trim());
39 //} else {
40 //logger.debug("@@@@ no key for pageURL: " + pageURL);
41 //}
42 /*
43 if(pageURL.contains(TEXT_END_MARKER)) {
44 logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45 logger.debug("+++++++++");
46 logger.debug(unparsedPageDump);
47 logger.debug("+++++++++");
48 }
49 */
50
51 boolean readingPageText = false;
52 StringBuilder pageText = null;
53
54 // continue reading all other tuples for this page, if any
55 while((line = reader.readLine()) != null) {
56 line = line.trim();
57
58 // check if we're dealing with metadata or start/end of page's text body
59 // or actual text body
60
61 if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
62 pageText = new StringBuilder();
63 readingPageText = true;
64 }
65 else if(line.equals(TEXT_END_MARKER)) {
66 // finished with a page body
67 // Remove any FINAL artificial newline we introduced to a page's body text
68 tuples.put("pageText", pageText.toString().trim());
69 readingPageText = false;
70 pageText = null;
71 }
72 else {
73 if(readingPageText) { // So we're reading in the page text
74 pageText.append(line);
75 pageText.append("\n"); // there are no newlines within pageText
76 // but if there were newlines, add them back here as readLine() removes them
77 }
78 else { // dealing with the rest of the page dump's metadata
79 // QTODO: nutch's text dump output is problematic
80 // strange characters are in the stream and end up here
81 // and can't detect end of metadata or even end of line.
82 endIndex = line.indexOf(":");
83 if(endIndex != -1) {
84 String k = line.substring(0, endIndex);
85 String v = line.substring(endIndex+1);
86 if(k.startsWith("metadata")) {
87 k = k.substring("metadata".length());
88 }
89
90 tuples.put(k.trim(), v.trim());
91 } else {
92 if(NutchTextDumpProcessor.DEBUG_MODE) {
93 logger.error("No meta key for meta: " + line);
94 logger.error(unparsedPageDump);
95 }
96 }
97 }
98 }
99
100 }
101
102 // If the page had no pageText, add a "pageText" -> "" mapping
103 if(!tuples.containsKey("pageText")) {
104 tuples.put("pageText", "");
105 }
106
107
108 } catch (IOException ioe) {
109 logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
110 }
111
112
113 // START DEBUG
114 debugTuples();
115 // END DEBUG
116
117 }
118
119 public void debugTuples() {
120 if(NutchTextDumpProcessor.DEBUG_MODE) {
121 logger.debug("__________________________________________");
122 for(Map.Entry<String, String> entry : tuples.entrySet()) {
123 String key = entry.getKey();
124 String value = entry.getValue();
125 logger.debug(key + " - " + value);
126 }
127 logger.debug("__________________________________________");
128 }
129 }
130
131
132 public String getPageURL() {
133 return tuples.get("pageURL");
134 }
135
136 public String getPageText() {
137 return tuples.get("pageText");
138 }
139
140 /* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
141 or encoding he meant, but storing 2 of several timestamps and selecting
142 original character encoding (presumably the char encoding of the page) out of 2
143 pieces of char encoding metadata to store. */
144 public String getModifiedTime() {
145 // is this the webpage's last mod time?
146 String time = tuples.get("modifiedTime");
147 time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
148 return time;
149 }
150 public String getFetchTime() {
151 // is this the nutch crawl time
152 String time = tuples.get("fetchTime");
153 time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
154 return time;
155
156 }
157 public String getOriginalCharEncoding() {
158 // is this the web page's char-encoding?
159 return tuples.get("OriginalCharEncoding");
160 }
161
162 public String get(String key) {
163 return tuples.get(key);
164 }
165
166 public void add(String key, String value) {
167 tuples.put(key, value);
168 }
169
170 public void addMRILanguageStatus(boolean status) {
171 if(status) {
172 tuples.put("isMRI", "true");
173 } else {
174 tuples.put("isMRI", "false");
175 }
176 }
177
178 public boolean getMRILanguageStatus() {
179 String value = tuples.get("isMRI");
180 if(value == null) {
181 return false;
182 }
183 if(value.equals("true")) {
184 return true;
185 }
186 else {
187 return false;
188 }
189
190 }
191
192}
Note: See TracBrowser for help on using the repository browser.