source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33576

Last change on this file since 33576 was 33576, checked in by ak19, 5 years ago

Introducing 2 new Java files still being written and untested. NutchTextDumpProcessor which uses TextDumpPage to parse the text dump in dump.txt of each site crawled by nutch.

File size: 4.0 KB
Line 
1package org.greenstone.atea;
2
import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
7
8public class NutchTextDumpProcessor {
9 private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
10
11 private static MaoriTextDetector maoriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
12
13 public final String siteID; // is this necessary?
14
15 /** keep a list to store the text of each page */
16 private ArrayList<TextDumpPage> pages;
17
18
19 public NutchTextDumpProcessor(String siteID, File txtDumpFile) {
20 // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
21 this.siteID = siteID;
22
23
24 pages = new ArrayList<TextDumpPage>();
25
26 String line = null;
27 StringBuilder pageDump = new StringBuilder();
28 try (
29 BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
30 ) {
31
32 while((line = reader.readLine()) != null) { // readLine removes newline separator
33 line = line.trim();
34 // an empty line marks the end of a page in nutch's text dump of a site
35 if(!line.equals("")) {
36 pageDump.append(line);
37 pageDump.append("\n");
38 } else {
39 TextDumpPage page = new TextDumpPage(pageDump.toString());
40 // parses the fields and body text of a webpage in nutch's txt dump of entire site
41 //page.parseFields();
42 //page.getText();
43 pages.add(page);
44 pageDump = null;
45 pageDump = new StringBuilder();
46 }
47 }
48
49 } catch (IOException ioe) {
50 error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
51 }
52
53 }
54
55 /** pageID: id into pages array */
56 public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
57
58 String text = getTextForPage(pageID);
59 return maoriTxtDetector.isTextInMaori(text);
60 }
61
62 private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
63 if(pageID < 0 || pageID >= pages.size()) {
64 throw new ArrayIndexOutOfBoundsException();
65 }
66
67 TextDumpPage page = pages.get(pageID);
68 return page;
69 }
70
71 public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
72 TextDumpPage page = getPage(pageID);
73 return page.getPageText();
74 }
75 public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
76 TextDumpPage page = getPage(pageID);
77 return page.getPageURL();
78 }
79
80
81 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
82 public static void info(String msg) {
83 System.err.println(msg);
84 logger.info(msg);
85 }
86 public static void debug(String msg) {
87 System.err.println(msg);
88 logger.debug(msg);
89 }
90 public static void warn(String msg) {
91 System.err.println(msg);
92 logger.warn(msg);
93 }
94 public static void error(String msg) {
95 System.err.println(msg);
96 logger.error(msg);
97 }
98 public static void error(String msg, Exception e) {
99 logger.error(msg, e);
100 System.err.println("\n"+msg);
101 e.printStackTrace();
102 }
103
104 public static void printUsage() {
105 info("Run this program as:");
106 info("\tNutchTextDumpProcessor <path to 'sites' folder>");
107 }
108
109 public static void main(String[] args) {
110 if(args.length != 1) {
111 printUsage();
112 return;
113 }
114
115 File sitesDir = new File(args[0]);
116 if(!sitesDir.exists() || !sitesDir.isDirectory()) {
117 error("Error: " + args[0] + " does not exist or is not a directory");
118 return;
119 }
120
121 try {
122 File[] sites = sitesDir.listFiles();
123 for(File siteDir : sites) { // e.g. 00001
124 // look for dump.txt
125 File txtDumpFile = new File(siteDir, dump.txt);
126 if(!txtDumpFile.exists()) {
127 error("Text dump file " + txtDumpFile + " did not exist");
128 continue;
129 }
130
131 else {
132 String siteID = siteDir.getName();
133 NutchTextDumpProcessor nutchTxtDump = NutchTextDumpProcessor(siteID, txtDumpFile);
134
135 }
136
137 }
138
139 } catch(Exception e) {
140 // can get an exception when instantiating CCWETProcessor instance
141 error(e.getMessage(), e);
142 }
143 }
144}
Note: See TracBrowser for help on using the repository browser.