Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33576

Last change on this file since 33576 was 33576, checked in by ak19, 5 years ago
Introducing 2 new Java files, both still being written and untested: NutchTextDumpProcessor, which parses the text dump in dump.txt of each site crawled by Nutch, and TextDumpPage, which it uses to represent each page in that dump.
File size: 4.0 KB

Line
1	package org.greenstone.atea;
2
import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
7
8	public class NutchTextDumpProcessor {
9	private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
10
11	private static MaoriTextDetector maoriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
12
13	public final String siteID; // is this necessary?
14
15	/** keep a list to store the text of each page */
16	private ArrayList<TextDumpPage> pages;
17
18
19	public NutchTextDumpProcessor(String siteID, File txtDumpFile) {
20	// siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
21	this.siteID = siteID;
22
23
24	pages = new ArrayList<TextDumpPage>();
25
26	String line = null;
27	StringBuilder pageDump = new StringBuilder();
28	try (
29	BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
30	) {
31
32	while((line = reader.readLine()) != null) { // readLine removes newline separator
33	line = line.trim();
34	// an empty line marks the end of a page in nutch's text dump of a site
35	if(!line.equals("")) {
36	pageDump.append(line);
37	pageDump.append("\n");
38	} else {
39	TextDumpPage page = new TextDumpPage(pageDump.toString());
40	// parses the fields and body text of a webpage in nutch's txt dump of entire site
41	//page.parseFields();
42	//page.getText();
43	pages.add(page);
44	pageDump = null;
45	pageDump = new StringBuilder();
46	}
47	}
48
49	} catch (IOException ioe) {
50	error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
51	}
52
53	}
54
55	/** pageID: id into pages array */
56	public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
57
58	String text = getTextForPage(pageID);
59	return maoriTxtDetector.isTextInMaori(text);
60	}
61
62	private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
63	if(pageID < 0 \|\| pageID >= pages.size()) {
64	throw new ArrayIndexOutOfBoundsException();
65	}
66
67	TextDumpPage page = pages.get(pageID);
68	return page;
69	}
70
71	public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
72	TextDumpPage page = getPage(pageID);
73	return page.getPageText();
74	}
75	public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
76	TextDumpPage page = getPage(pageID);
77	return page.getPageURL();
78	}
79
80
81	// --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
82	public static void info(String msg) {
83	System.err.println(msg);
84	logger.info(msg);
85	}
86	public static void debug(String msg) {
87	System.err.println(msg);
88	logger.debug(msg);
89	}
90	public static void warn(String msg) {
91	System.err.println(msg);
92	logger.warn(msg);
93	}
94	public static void error(String msg) {
95	System.err.println(msg);
96	logger.error(msg);
97	}
98	public static void error(String msg, Exception e) {
99	logger.error(msg, e);
100	System.err.println("\n"+msg);
101	e.printStackTrace();
102	}
103
104	public static void printUsage() {
105	info("Run this program as:");
106	info("\tNutchTextDumpProcessor <path to 'sites' folder>");
107	}
108
109	public static void main(String[] args) {
110	if(args.length != 1) {
111	printUsage();
112	return;
113	}
114
115	File sitesDir = new File(args[0]);
116	if(!sitesDir.exists() \|\| !sitesDir.isDirectory()) {
117	error("Error: " + args[0] + " does not exist or is not a directory");
118	return;
119	}
120
121	try {
122	File[] sites = sitesDir.listFiles();
123	for(File siteDir : sites) { // e.g. 00001
124	// look for dump.txt
125	File txtDumpFile = new File(siteDir, dump.txt);
126	if(!txtDumpFile.exists()) {
127	error("Text dump file " + txtDumpFile + " did not exist");
128	continue;
129	}
130
131	else {
132	String siteID = siteDir.getName();
133	NutchTextDumpProcessor nutchTxtDump = NutchTextDumpProcessor(siteID, txtDumpFile);
134
135	}
136
137	}
138
139	} catch(Exception e) {
140	// can get an exception when instantiating CCWETProcessor instance
141	error(e.getMessage(), e);
142	}
143	}
144	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: