Changeset 33580 for gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
- Timestamp:
- 2019-10-17T21:44:46+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33579 r33580 52 52 /** keep a list to store the text of each page */ 53 53 private ArrayList<TextDumpPage> pages; 54 54 55 private boolean isStartOfNewWebPageRecord(String prevLine, String line) { 56 // The start of a new web page's record in nutch's text dump of an entire site 57 // is denoted by a newline followed by a URL (protocol) 58 // or the very start of the file with a URL (protocol) 59 return ((prevLine == null || prevLine.equals("")) 60 && (line.startsWith("http://") || line.startsWith("https://"))); 61 } 62 63 public void debugPageDump(StringBuilder pageDump) { 64 65 // START DEBUG 66 debug("__________________________________________"); 67 debug("@@@ Found page entry: "); 68 debug("__________________________________________"); 69 debug(pageDump.toString()); 70 debug("------------------------------------------"); 71 // END DEBUG 72 } 55 73 56 74 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) { … … 62 80 63 81 String line = null; 64 StringBuilder pageDump = new StringBuilder(); 82 StringBuilder pageDump = null; 65 83 try ( 66 84 BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile)); … … 68 86 69 87 boolean readingText = false; 88 String prevLine = null; 70 89 71 90 while((line = reader.readLine()) != null) { // readLine removes newline separator … … 76 95 // markers of a page's text, though. 
77 96 78 if(!readingText && line.equals("")) { 97 if(isStartOfNewWebPageRecord(prevLine, line)) { 98 99 if(pageDump != null) { // should also be the case then: if(prevLine != null) 100 // finish old pageDump and begin new one 101 102 //debugPageDump(pageDump); 103 104 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 105 // parses the fields and body text of a webpage in nutch's txt dump of entire site 106 //page.parseFields(); 107 //page.getText(); 108 pages.add(page); 109 pageDump = null; 110 111 } 112 113 // begin new webpage dump 114 pageDump = new StringBuilder(); 79 115 pageDump.append(line); 80 116 pageDump.append("\n"); 81 82 117 83 // START DEBUG84 debug("__________________________________________");85 debug("@@@ Found page entry: ");86 debug("__________________________________________");87 debug(pageDump.toString());88 debug("------------------------------------------");89 // END DEBUG90 91 92 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());93 // parses the fields and body text of a webpage in nutch's txt dump of entire site94 //page.parseFields();95 //page.getText();96 pages.add(page);97 pageDump = null;98 99 pageDump = new StringBuilder();100 118 } 101 else if (!line.equals("")) { // empty line 102 if(line.equals(TextDumpPage.TEXT_START_MARKER)) { 103 readingText = true; 104 } 105 if(line.equals(TextDumpPage.TEXT_END_MARKER)) { 106 readingText = false; 107 } 119 else if(!line.equals("")) { 108 120 pageDump.append(line); 109 121 pageDump.append("\n"); 122 110 123 } 111 124 // can throw away any newlines between text start and end markers. 125 126 prevLine = line; 112 127 } 128 129 // process final webpage record: 130 //debugPageDump(pageDump); 131 132 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 133 pages.add(page); 134 pageDump = null; 113 135 114 136 } catch (IOException ioe) { … … 190 212 File[] sites = sitesDir.listFiles(); 191 213 for(File siteDir : sites) { // e.g. 
00001 192 // look for dump.txt 193 File txtDumpFile = new File(siteDir, "dump.txt"); 194 if(!txtDumpFile.exists()) { 195 error("Text dump file " + txtDumpFile + " did not exist"); 196 continue; 197 } 198 199 else { 200 String siteID = siteDir.getName(); 201 debug("Found siteID: " + siteID); 202 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 214 if(siteDir.isDirectory()) { 215 // look for dump.txt 216 File txtDumpFile = new File(siteDir, "dump.txt"); 217 if(!txtDumpFile.exists()) { 218 error("Text dump file " + txtDumpFile + " did not exist"); 219 continue; 220 } 221 222 else { 223 String siteID = siteDir.getName(); 224 debug("Found siteID: " + siteID); 225 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 226 } 203 227 } 204 228
Note: See TracChangeset for help on using the changeset viewer.