source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33580

Last change on this file since 33580 was 33580, checked in by ak19, 5 years ago

Finally fixed the thus-far identified bugs when parsing dump.txt.

File size: 4.1 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.HashMap;
5import java.util.Map;
6
7import org.apache.log4j.Logger;
8
9
10public class TextDumpPage {
11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13 public static final String TEXT_START_MARKER="text:start:";
14 public static final String TEXT_END_MARKER="text:end:";
15
16 private Map<String, String> tuples;
17
18 public TextDumpPage(String siteID, String unparsedPageDump) {
19 tuples = new HashMap<String, String>();
20
21 try (
22 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23 ) {
24
25 String line = reader.readLine(); // should have at least first line
26
27 // first line always has a "key:" somewhere after the pageURL
28 int endIndex = line.indexOf("key:");
29
30 String pageURL = line.substring(0, endIndex);
31 //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34 tuples.put("pageURL", pageURL.trim());
35
36 //if(endIndex != -1) {
37 String key = line.substring(endIndex);
38 tuples.put("key", key.trim());
39 //} else {
40 //debug("@@@@ no key for pageURL: " + pageURL);
41 //}
42 /*
43 if(pageURL.contains(TEXT_END_MARKER)) {
44 debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45 debug("+++++++++");
46 debug(unparsedPageDump);
47 debug("+++++++++");
48 }
49 */
50
51 boolean readingPageText = false;
52 StringBuilder pageText = null;
53
54 // continue reading all other tuples for this page, if any
55 while((line = reader.readLine()) != null) {
56 line = line.trim();
57
58 // check if we're dealing with metadata or start/end of page's text body
59 // or actual text body
60
61 if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
62 pageText = new StringBuilder();
63 readingPageText = true;
64 }
65 else if(line.equals(TEXT_END_MARKER)) {
66 // finished with a page body
67 // Remove any FINAL artificial newline we introduced to a page's body text
68 tuples.put("pageText", pageText.toString().trim());
69 readingPageText = false;
70 pageText = null;
71 }
72 else {
73 if(readingPageText) { // So we're reading in the page text
74 pageText.append(line);
75 pageText.append("\n"); // there are no newlines within pageText
76 // but if there were newlines, add them back here as readLine() removes them
77 }
78 else { // dealing with the rest of the page dump's metadata
79 endIndex = line.indexOf(":");
80 if(endIndex != -1) {
81 String k = line.substring(0, endIndex);
82 String v = line.substring(endIndex+1);
83 tuples.put(k.trim(), v.trim());
84 } else {
85 error("No meta key for meta: " + line);
86 }
87 }
88 }
89
90 }
91
92 // If the page had no pageText, add a "pageText" -> "" mapping
93 if(!tuples.containsKey("pageText")) {
94 tuples.put("pageText", "");
95 }
96
97
98 } catch (IOException ioe) {
99 error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
100 }
101
102
103 // START DEBUG
104 debug("__________________________________________");
105 for(Map.Entry<String, String> entry : tuples.entrySet()) {
106 String key = entry.getKey();
107 String value = entry.getValue();
108 debug(key + " - " + value);
109 }
110 debug("__________________________________________");
111 // END DEBUG
112
113 }
114
115
116 public String getPageURL() {
117 return tuples.get("url");
118 }
119
120 public String getPageText() {
121 return tuples.get("pageText");
122 }
123
124 public String get(String key) {
125 return tuples.get(key);
126 }
127
128 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
129 public static void info(String msg) {
130 System.err.println(msg);
131 logger.info(msg);
132 }
133 public static void debug(String msg) {
134 System.err.println(msg);
135 logger.debug(msg);
136 }
137 public static void warn(String msg) {
138 System.err.println(msg);
139 logger.warn(msg);
140 }
141 public static void error(String msg) {
142 System.err.println(msg);
143 logger.error(msg);
144 }
145 public static void error(String msg, Exception e) {
146 logger.error(msg, e);
147 System.err.println("\n"+msg);
148 e.printStackTrace();
149 }
150
151}
Note: See TracBrowser for help on using the repository browser.