source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java@ 33578

Last change on this file since 33578 was 33578, checked in by ak19, 5 years ago

Corrections for compiling the 2 new classes.

File size: 3.2 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.HashMap;
5import java.util.Map;
6
7import org.apache.log4j.Logger;
8
9
10public class TextDumpPage {
11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName());
12
13 public static final String TEXT_START_MARKER="text:start:";
14 public static final String TEXT_END_MARKER="text:end:";
15
16 private Map<String, String> tuples;
17
18 public TextDumpPage(String siteID, String unparsedPageDump) {
19 tuples = new HashMap<String, String>();
20
21 try (
22 BufferedReader reader = new BufferedReader(new StringReader(unparsedPageDump));
23 ) {
24
25 String line = reader.readLine(); // should have at least first line
26
27 // first line always has a "key:" somewhere after the pageURL
28 int endIndex = line.indexOf("key:");
29 String pageURL = line.substring(endIndex);
30
31 tuples.put("pageURL", pageURL.trim());
32
33 String key = line.substring(endIndex);
34 tuples.put("key", key.trim());
35
36 boolean readingPageText = false;
37 StringBuilder pageText = null;
38
39 // continue reading all other tuples for this page, if any
40 while((line = reader.readLine()) != null) {
41
42 if(!readingPageText) {
43 // check if we're dealing with metadata or start/end of pagetext
44 endIndex = line.indexOf(":");
45 if(endIndex != -1) { // dealing with the rest of the page dump's metadata
46 String k = line.substring(0, endIndex);
47 String v = line.substring(endIndex+1);
48 tuples.put(k.trim(), v.trim());
49 }
50
51 else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
52 pageText = new StringBuilder();
53 readingPageText = true;
54 }
55 }
56
57 else { // we're reading in the page text
58
59 if(line.equals(TEXT_END_MARKER)) {
60 // finished with a page body
61 // remove any FINAL artificial newline we introduced
62 tuples.put("pageText", pageText.toString().trim());
63 readingPageText = false;
64 pageText = null;
65 }
66 else {
67 pageText.append(line);
68 pageText.append("\n"); // there are no newlines within pageText
69 // but if there were newlines, add them back here as readLine() removes them
70 }
71
72 }
73 }
74
75 // If the page had no pageText, add a "pageText" -> "" mapping
76 if(!tuples.containsKey("pageText")) {
77 tuples.put("pageText", "");
78 }
79
80 } catch (IOException ioe) {
81 error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
82 }
83 }
84
85
86 public String getPageURL() {
87 return tuples.get("url");
88 }
89
90 public String getPageText() {
91 return tuples.get("pageText");
92 }
93
94 public String get(String key) {
95 return tuples.get(key);
96 }
97
98 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
99 public static void info(String msg) {
100 System.err.println(msg);
101 logger.info(msg);
102 }
103 public static void debug(String msg) {
104 System.err.println(msg);
105 logger.debug(msg);
106 }
107 public static void warn(String msg) {
108 System.err.println(msg);
109 logger.warn(msg);
110 }
111 public static void error(String msg) {
112 System.err.println(msg);
113 logger.error(msg);
114 }
115 public static void error(String msg, Exception e) {
116 logger.error(msg, e);
117 System.err.println("\n"+msg);
118 e.printStackTrace();
119 }
120
121}
Note: See TracBrowser for help on using the repository browser.