source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33582

Last change on this file since 33582 was 33582, checked in by ak19, 5 years ago

NutchTextDumpProcessor prints each crawled site's stats: number of webpages per crawled site and how many of those were detected by OpenNLP as being in Maori (mri). Needed to make a reusable method in CCWETProcessor as public and static.

File size: 4.9 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.HashMap;
5import java.util.Map;
6
7import org.apache.log4j.Logger;
8
9
10public class TextDumpPage {
11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13 public static final String TEXT_START_MARKER="text:start:";
14 public static final String TEXT_END_MARKER="text:end:";
15
16 private Map<String, String> tuples;
17
18 public TextDumpPage(String siteID, String unparsedPageDump) {
19 tuples = new HashMap<String, String>();
20
21 try (
22 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23 ) {
24
25 String line = reader.readLine(); // should have at least first line
26
27 // first line always has a "key:" somewhere after the pageURL
28 int endIndex = line.indexOf("key:");
29
30 String pageURL = line.substring(0, endIndex);
31 //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34 tuples.put("pageURL", pageURL.trim());
35
36 //if(endIndex != -1) {
37 String key = line.substring(endIndex);
38 tuples.put("key", key.trim());
39 //} else {
40 //debug("@@@@ no key for pageURL: " + pageURL);
41 //}
42 /*
43 if(pageURL.contains(TEXT_END_MARKER)) {
44 debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45 debug("+++++++++");
46 debug(unparsedPageDump);
47 debug("+++++++++");
48 }
49 */
50
51 boolean readingPageText = false;
52 StringBuilder pageText = null;
53
54 // continue reading all other tuples for this page, if any
55 while((line = reader.readLine()) != null) {
56 line = line.trim();
57
58 // check if we're dealing with metadata or start/end of page's text body
59 // or actual text body
60
61 if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
62 pageText = new StringBuilder();
63 readingPageText = true;
64 }
65 else if(line.equals(TEXT_END_MARKER)) {
66 // finished with a page body
67 // Remove any FINAL artificial newline we introduced to a page's body text
68 tuples.put("pageText", pageText.toString().trim());
69 readingPageText = false;
70 pageText = null;
71 }
72 else {
73 if(readingPageText) { // So we're reading in the page text
74 pageText.append(line);
75 pageText.append("\n"); // there are no newlines within pageText
76 // but if there were newlines, add them back here as readLine() removes them
77 }
78 else { // dealing with the rest of the page dump's metadata
79 // QTODO: nutch's text dump output is problematic
80 // strange characters are in the stream and end up here
81 // and can't detect end of metadata or even end of line.
82 endIndex = line.indexOf(":");
83 if(endIndex != -1) {
84 String k = line.substring(0, endIndex);
85 String v = line.substring(endIndex+1);
86 tuples.put(k.trim(), v.trim());
87 } else {
88 if(NutchTextDumpProcessor.DEBUG_MODE) {
89 error("No meta key for meta: " + line);
90 error(unparsedPageDump);
91 }
92 }
93 }
94 }
95
96 }
97
98 // If the page had no pageText, add a "pageText" -> "" mapping
99 if(!tuples.containsKey("pageText")) {
100 tuples.put("pageText", "");
101 }
102
103
104 } catch (IOException ioe) {
105 error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
106 }
107
108
109 // START DEBUG
110 debugTuples();
111 // END DEBUG
112
113 }
114
115 public void debugTuples() {
116 if(NutchTextDumpProcessor.DEBUG_MODE) {
117 debug("__________________________________________");
118 for(Map.Entry<String, String> entry : tuples.entrySet()) {
119 String key = entry.getKey();
120 String value = entry.getValue();
121 debug(key + " - " + value);
122 }
123 debug("__________________________________________");
124 }
125 }
126
127
128 public String getPageURL() {
129 return tuples.get("pageURL");
130 }
131
132 public String getPageText() {
133 return tuples.get("pageText");
134 }
135
136 public String get(String key) {
137 return tuples.get(key);
138 }
139
140 public void add(String key, String value) {
141 tuples.put(key, value);
142 }
143
144 public void addMRILanguageStatus(boolean status) {
145 if(status) {
146 tuples.put("isMRI", "true");
147 } else {
148 tuples.put("isMRI", "false");
149 }
150 }
151
152 public boolean getMRILanguageStatus() {
153 String value = tuples.get("isMRI");
154 if(value == null) {
155 return false;
156 }
157 if(value.equals("true")) {
158 return true;
159 }
160 else {
161 return false;
162 }
163
164 }
165
166 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
167 public static void info(String msg) {
168 System.err.println(msg);
169 logger.info(msg);
170 }
171 public static void debug(String msg) {
172 System.err.println(msg);
173 logger.debug(msg);
174 }
175 public static void warn(String msg) {
176 System.err.println(msg);
177 logger.warn(msg);
178 }
179 public static void error(String msg) {
180 System.err.println(msg);
181 logger.error(msg);
182 }
183 public static void error(String msg, Exception e) {
184 logger.error(msg, e);
185 System.err.println("\n"+msg);
186 e.printStackTrace();
187 }
188
189}
Note: See TracBrowser for help on using the repository browser.