source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33579

Last change on this file since 33579 was 33579, checked in by ak19, 5 years ago

Debugging. Solved one problem.

File size: 3.9 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.HashMap;
5import java.util.Map;
6
7import org.apache.log4j.Logger;
8
9
10public class TextDumpPage {
11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13 public static final String TEXT_START_MARKER="text:start:";
14 public static final String TEXT_END_MARKER="text:end:";
15
16 private Map<String, String> tuples;
17
18 public TextDumpPage(String siteID, String unparsedPageDump) {
19 tuples = new HashMap<String, String>();
20
21 try (
22 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23 ) {
24
25 String line = reader.readLine(); // should have at least first line
26
27 // first line always has a "key:" somewhere after the pageURL
28 int endIndex = line.indexOf("key:");
29
30 //String pageURL = line.substring(0, endIndex);
31 String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
32
33
34 tuples.put("pageURL", pageURL.trim());
35
36 if(endIndex != -1) {
37 String key = line.substring(endIndex);
38 tuples.put("key", key.trim());
39 } else {
40 debug("@@@@ no key for pageURL: " + pageURL);
41 }
42
43 if(pageURL.contains(TEXT_END_MARKER)) {
44 debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
45 debug("+++++++++");
46 debug(unparsedPageDump);
47 debug("+++++++++");
48 }
49
50 boolean readingPageText = false;
51 StringBuilder pageText = null;
52
53 // continue reading all other tuples for this page, if any
54 while((line = reader.readLine()) != null) {
55
56 if(!readingPageText) {
57 // check if we're dealing with metadata or start/end of pagetext
58 endIndex = line.indexOf(":");
59 if(endIndex != -1) { // dealing with the rest of the page dump's metadata
60 String k = line.substring(0, endIndex);
61 String v = line.substring(endIndex+1);
62 tuples.put(k.trim(), v.trim());
63 }
64
65 else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
66 pageText = new StringBuilder();
67 readingPageText = true;
68 }
69 }
70
71 else { // we're reading in the page text
72
73 if(line.equals(TEXT_END_MARKER)) {
74 // finished with a page body
75 // remove any FINAL artificial newline we introduced
76 tuples.put("pageText", pageText.toString().trim());
77 readingPageText = false;
78 pageText = null;
79 }
80 else {
81 pageText.append(line);
82 pageText.append("\n"); // there are no newlines within pageText
83 // but if there were newlines, add them back here as readLine() removes them
84 }
85
86 }
87 }
88
89 // If the page had no pageText, add a "pageText" -> "" mapping
90 if(!tuples.containsKey("pageText")) {
91 tuples.put("pageText", "");
92 }
93
94
95 } catch (IOException ioe) {
96 error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
97 }
98
99 /*
100 // START DEBUG
101 debug("__________________________________________");
102 for(Map.Entry<String, String> entry : tuples.entrySet()) {
103 String key = entry.getKey();
104 String value = entry.getValue();
105 debug(key + " - " + value);
106 }
107 debug("__________________________________________");
108 // END DEBUG
109 */
110 }
111
112
113 public String getPageURL() {
114 return tuples.get("url");
115 }
116
117 public String getPageText() {
118 return tuples.get("pageText");
119 }
120
121 public String get(String key) {
122 return tuples.get(key);
123 }
124
125 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
126 public static void info(String msg) {
127 System.err.println(msg);
128 logger.info(msg);
129 }
130 public static void debug(String msg) {
131 System.err.println(msg);
132 logger.debug(msg);
133 }
134 public static void warn(String msg) {
135 System.err.println(msg);
136 logger.warn(msg);
137 }
138 public static void error(String msg) {
139 System.err.println(msg);
140 logger.error(msg);
141 }
142 public static void error(String msg, Exception e) {
143 logger.error(msg, e);
144 System.err.println("\n"+msg);
145 e.printStackTrace();
146 }
147
148}
Note: See TracBrowser for help on using the repository browser.