source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33615

Last change on this file since 33615 was 33615, checked in by ak19, 4 years ago
  1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some Mongodb links under MoreReading.
File size: 4.3 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.HashMap;
5import java.util.Map;
6
7import org.apache.log4j.Logger;
8
9
10public class TextDumpPage {
11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13 public static final String TEXT_START_MARKER="text:start:";
14 public static final String TEXT_END_MARKER="text:end:";
15
16 private Map<String, String> tuples;
17
18 public TextDumpPage(String siteID, String unparsedPageDump) {
19 tuples = new HashMap<String, String>();
20
21 try (
22 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23 ) {
24
25 String line = reader.readLine(); // should have at least first line
26
27 // first line always has a "key:" somewhere after the pageURL
28 int endIndex = line.indexOf("key:");
29
30 String pageURL = line.substring(0, endIndex);
31 //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34 tuples.put("pageURL", pageURL.trim());
35
36 //if(endIndex != -1) {
37 String key = line.substring(endIndex);
38 tuples.put("key", key.trim());
39 //} else {
40 //logger.debug("@@@@ no key for pageURL: " + pageURL);
41 //}
42 /*
43 if(pageURL.contains(TEXT_END_MARKER)) {
44 logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45 logger.debug("+++++++++");
46 logger.debug(unparsedPageDump);
47 logger.debug("+++++++++");
48 }
49 */
50
51 boolean readingPageText = false;
52 StringBuilder pageText = null;
53
54 // continue reading all other tuples for this page, if any
55 while((line = reader.readLine()) != null) {
56 line = line.trim();
57
58 // check if we're dealing with metadata or start/end of page's text body
59 // or actual text body
60
61 if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
62 pageText = new StringBuilder();
63 readingPageText = true;
64 }
65 else if(line.equals(TEXT_END_MARKER)) {
66 // finished with a page body
67 // Remove any FINAL artificial newline we introduced to a page's body text
68 tuples.put("pageText", pageText.toString().trim());
69 readingPageText = false;
70 pageText = null;
71 }
72 else {
73 if(readingPageText) { // So we're reading in the page text
74 pageText.append(line);
75 pageText.append("\n"); // there are no newlines within pageText
76 // but if there were newlines, add them back here as readLine() removes them
77 }
78 else { // dealing with the rest of the page dump's metadata
79 // QTODO: nutch's text dump output is problematic
80 // strange characters are in the stream and end up here
81 // and can't detect end of metadata or even end of line.
82 endIndex = line.indexOf(":");
83 if(endIndex != -1) {
84 String k = line.substring(0, endIndex);
85 String v = line.substring(endIndex+1);
86 tuples.put(k.trim(), v.trim());
87 } else {
88 if(NutchTextDumpProcessor.DEBUG_MODE) {
89 logger.error("No meta key for meta: " + line);
90 logger.error(unparsedPageDump);
91 }
92 }
93 }
94 }
95
96 }
97
98 // If the page had no pageText, add a "pageText" -> "" mapping
99 if(!tuples.containsKey("pageText")) {
100 tuples.put("pageText", "");
101 }
102
103
104 } catch (IOException ioe) {
105 logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
106 }
107
108
109 // START DEBUG
110 debugTuples();
111 // END DEBUG
112
113 }
114
115 public void debugTuples() {
116 if(NutchTextDumpProcessor.DEBUG_MODE) {
117 logger.debug("__________________________________________");
118 for(Map.Entry<String, String> entry : tuples.entrySet()) {
119 String key = entry.getKey();
120 String value = entry.getValue();
121 logger.debug(key + " - " + value);
122 }
123 logger.debug("__________________________________________");
124 }
125 }
126
127
128 public String getPageURL() {
129 return tuples.get("pageURL");
130 }
131
132 public String getPageText() {
133 return tuples.get("pageText");
134 }
135
136 public String get(String key) {
137 return tuples.get(key);
138 }
139
140 public void add(String key, String value) {
141 tuples.put(key, value);
142 }
143
144 public void addMRILanguageStatus(boolean status) {
145 if(status) {
146 tuples.put("isMRI", "true");
147 } else {
148 tuples.put("isMRI", "false");
149 }
150 }
151
152 public boolean getMRILanguageStatus() {
153 String value = tuples.get("isMRI");
154 if(value == null) {
155 return false;
156 }
157 if(value.equals("true")) {
158 return true;
159 }
160 else {
161 return false;
162 }
163
164 }
165
166}
Note: See TracBrowser for help on using the repository browser.