package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

8 | public class NutchTextDumpProcessor {
|
---|
9 | private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
|
---|
10 |
|
---|
11 | private static MaoriTextDetector maoriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
|
---|
12 |
|
---|
13 | public final String siteID; // is this necessary?
|
---|
14 |
|
---|
15 | /** keep a list to store the text of each page */
|
---|
16 | private ArrayList<TextDumpPage> pages;
|
---|
17 |
|
---|
18 |
|
---|
19 | public NutchTextDumpProcessor(String siteID, File txtDumpFile) {
|
---|
20 | // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
|
---|
21 | this.siteID = siteID;
|
---|
22 |
|
---|
23 |
|
---|
24 | pages = new ArrayList<TextDumpPage>();
|
---|
25 |
|
---|
26 | String line = null;
|
---|
27 | StringBuilder pageDump = new StringBuilder();
|
---|
28 | try (
|
---|
29 | BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
|
---|
30 | ) {
|
---|
31 |
|
---|
32 | while((line = reader.readLine()) != null) { // readLine removes newline separator
|
---|
33 | line = line.trim();
|
---|
34 | // an empty line marks the end of a page in nutch's text dump of a site
|
---|
35 | if(!line.equals("")) {
|
---|
36 | pageDump.append(line);
|
---|
37 | pageDump.append("\n");
|
---|
38 | } else {
|
---|
39 | TextDumpPage page = new TextDumpPage(pageDump.toString());
|
---|
40 | // parses the fields and body text of a webpage in nutch's txt dump of entire site
|
---|
41 | //page.parseFields();
|
---|
42 | //page.getText();
|
---|
43 | pages.add(page);
|
---|
44 | pageDump = null;
|
---|
45 | pageDump = new StringBuilder();
|
---|
46 | }
|
---|
47 | }
|
---|
48 |
|
---|
49 | } catch (IOException ioe) {
|
---|
50 | error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
|
---|
51 | }
|
---|
52 |
|
---|
53 | }
|
---|
54 |
|
---|
55 | /** pageID: id into pages array */
|
---|
56 | public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
|
---|
57 |
|
---|
58 | String text = getTextForPage(pageID);
|
---|
59 | return maoriTxtDetector.isTextInMaori(text);
|
---|
60 | }
|
---|
61 |
|
---|
62 | private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
|
---|
63 | if(pageID < 0 || pageID >= pages.size()) {
|
---|
64 | throw new ArrayIndexOutOfBoundsException();
|
---|
65 | }
|
---|
66 |
|
---|
67 | TextDumpPage page = pages.get(pageID);
|
---|
68 | return page;
|
---|
69 | }
|
---|
70 |
|
---|
71 | public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
|
---|
72 | TextDumpPage page = getPage(pageID);
|
---|
73 | return page.getPageText();
|
---|
74 | }
|
---|
75 | public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
|
---|
76 | TextDumpPage page = getPage(pageID);
|
---|
77 | return page.getPageURL();
|
---|
78 | }
|
---|
79 |
|
---|
80 |
|
---|
81 | // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
|
---|
82 | public static void info(String msg) {
|
---|
83 | System.err.println(msg);
|
---|
84 | logger.info(msg);
|
---|
85 | }
|
---|
86 | public static void debug(String msg) {
|
---|
87 | System.err.println(msg);
|
---|
88 | logger.debug(msg);
|
---|
89 | }
|
---|
90 | public static void warn(String msg) {
|
---|
91 | System.err.println(msg);
|
---|
92 | logger.warn(msg);
|
---|
93 | }
|
---|
94 | public static void error(String msg) {
|
---|
95 | System.err.println(msg);
|
---|
96 | logger.error(msg);
|
---|
97 | }
|
---|
98 | public static void error(String msg, Exception e) {
|
---|
99 | logger.error(msg, e);
|
---|
100 | System.err.println("\n"+msg);
|
---|
101 | e.printStackTrace();
|
---|
102 | }
|
---|
103 |
|
---|
104 | public static void printUsage() {
|
---|
105 | info("Run this program as:");
|
---|
106 | info("\tNutchTextDumpProcessor <path to 'sites' folder>");
|
---|
107 | }
|
---|
108 |
|
---|
109 | public static void main(String[] args) {
|
---|
110 | if(args.length != 1) {
|
---|
111 | printUsage();
|
---|
112 | return;
|
---|
113 | }
|
---|
114 |
|
---|
115 | File sitesDir = new File(args[0]);
|
---|
116 | if(!sitesDir.exists() || !sitesDir.isDirectory()) {
|
---|
117 | error("Error: " + args[0] + " does not exist or is not a directory");
|
---|
118 | return;
|
---|
119 | }
|
---|
120 |
|
---|
121 | try {
|
---|
122 | File[] sites = sitesDir.listFiles();
|
---|
123 | for(File siteDir : sites) { // e.g. 00001
|
---|
124 | // look for dump.txt
|
---|
125 | File txtDumpFile = new File(siteDir, dump.txt);
|
---|
126 | if(!txtDumpFile.exists()) {
|
---|
127 | error("Text dump file " + txtDumpFile + " did not exist");
|
---|
128 | continue;
|
---|
129 | }
|
---|
130 |
|
---|
131 | else {
|
---|
132 | String siteID = siteDir.getName();
|
---|
133 | NutchTextDumpProcessor nutchTxtDump = NutchTextDumpProcessor(siteID, txtDumpFile);
|
---|
134 |
|
---|
135 | }
|
---|
136 |
|
---|
137 | }
|
---|
138 |
|
---|
139 | } catch(Exception e) {
|
---|
140 | // can get an exception when instantiating CCWETProcessor instance
|
---|
141 | error(e.getMessage(), e);
|
---|
142 | }
|
---|
143 | }
|
---|
144 | }
|
---|