Changeset 33580
- Timestamp:
- 2019-10-17T21:44:46+13:00 (4 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33579 r33580 52 52 /** keep a list to store the text of each page */ 53 53 private ArrayList<TextDumpPage> pages; 54 54 55 private boolean isStartOfNewWebPageRecord(String prevLine, String line) { 56 // The start of a new web page's record in nutch's text dump of an entire site 57 // is denoted by a newline followed by a URL (protocol) 58 // or the very start of the file with a URL (protocol) 59 return ((prevLine == null || prevLine.equals("")) 60 && (line.startsWith("http://") || line.startsWith("https://"))); 61 } 62 63 public void debugPageDump(StringBuilder pageDump) { 64 65 // START DEBUG 66 debug("__________________________________________"); 67 debug("@@@ Found page entry: "); 68 debug("__________________________________________"); 69 debug(pageDump.toString()); 70 debug("------------------------------------------"); 71 // END DEBUG 72 } 55 73 56 74 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) { … … 62 80 63 81 String line = null; 64 StringBuilder pageDump = n ew StringBuilder();82 StringBuilder pageDump = null; 65 83 try ( 66 84 BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile)); … … 68 86 69 87 boolean readingText = false; 88 String prevLine = null; 70 89 71 90 while((line = reader.readLine()) != null) { // readLine removes newline separator … … 76 95 // markers of a page's text, though. 77 96 78 if(!readingText && line.equals("")) { 97 if(isStartOfNewWebPageRecord(prevLine, line)) { 98 99 if(pageDump != null) { // should also be the case then: if(prevLine != null) 100 // finish old pageDump and begin new one 101 102 //debugPageDump(pageDump); 103 104 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 105 // parses the fields and body text of a webpage in nutch's txt dump of entire site 106 //page.parseFields(); 107 //page.getText(); 108 pages.add(page); 109 pageDump = null; 110 111 } 112 113 // begin new webpage dump 114 pageDump = new StringBuilder(); 79 115 pageDump.append(line); 80 116 pageDump.append("\n"); 81 82 117 83 // START DEBUG84 debug("__________________________________________");85 debug("@@@ Found page entry: ");86 debug("__________________________________________");87 debug(pageDump.toString());88 debug("------------------------------------------");89 // END DEBUG90 91 92 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());93 // parses the fields and body text of a webpage in nutch's txt dump of entire site94 //page.parseFields();95 //page.getText();96 pages.add(page);97 pageDump = null;98 99 pageDump = new StringBuilder();100 118 } 101 else if (!line.equals("")) { // empty line 102 if(line.equals(TextDumpPage.TEXT_START_MARKER)) { 103 readingText = true; 104 } 105 if(line.equals(TextDumpPage.TEXT_END_MARKER)) { 106 readingText = false; 107 } 119 else if(!line.equals("")) { 108 120 pageDump.append(line); 109 121 pageDump.append("\n"); 122 110 123 } 111 124 // can throw away any newlines between text start and end markers. 125 126 prevLine = line; 112 127 } 128 129 // process final webpage record: 130 //debugPageDump(pageDump); 131 132 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 133 pages.add(page); 134 pageDump = null; 113 135 114 136 } catch (IOException ioe) { … … 190 212 File[] sites = sitesDir.listFiles(); 191 213 for(File siteDir : sites) { // e.g. 00001 192 // look for dump.txt 193 File txtDumpFile = new File(siteDir, "dump.txt"); 194 if(!txtDumpFile.exists()) { 195 error("Text dump file " + txtDumpFile + " did not exist"); 196 continue; 197 } 198 199 else { 200 String siteID = siteDir.getName(); 201 debug("Found siteID: " + siteID); 202 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 214 if(siteDir.isDirectory()) { 215 // look for dump.txt 216 File txtDumpFile = new File(siteDir, "dump.txt"); 217 if(!txtDumpFile.exists()) { 218 error("Text dump file " + txtDumpFile + " did not exist"); 219 continue; 220 } 221 222 else { 223 String siteID = siteDir.getName(); 224 debug("Found siteID: " + siteID); 225 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 226 } 203 227 } 204 228 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33579 r33580 28 28 int endIndex = line.indexOf("key:"); 29 29 30 //String pageURL = line.substring(0, endIndex);31 String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);30 String pageURL = line.substring(0, endIndex); 31 //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex); 32 32 33 33 34 34 tuples.put("pageURL", pageURL.trim()); 35 35 36 if(endIndex != -1) {37 38 39 } else {40 41 }42 36 //if(endIndex != -1) { 37 String key = line.substring(endIndex); 38 tuples.put("key", key.trim()); 39 //} else { 40 //debug("@@@@ no key for pageURL: " + pageURL); 41 //} 42 /* 43 43 if(pageURL.contains(TEXT_END_MARKER)) { 44 44 debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: "); … … 47 47 debug("+++++++++"); 48 48 } 49 */ 49 50 50 51 boolean readingPageText = false; … … 53 54 // continue reading all other tuples for this page, if any 54 55 while((line = reader.readLine()) != null) { 56 line = line.trim(); 57 58 // check if we're dealing with metadata or start/end of page's text body 59 // or actual text body 55 60 56 if(!readingPageText) { 57 // check if we're dealing with metadata or start/end of pagetext 58 endIndex = line.indexOf(":"); 59 if(endIndex != -1) { // dealing with the rest of the page dump's metadata 60 String k = line.substring(0, endIndex); 61 String v = line.substring(endIndex+1); 62 tuples.put(k.trim(), v.trim()); 63 } 64 65 else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text 66 pageText = new StringBuilder(); 67 readingPageText = true; 68 } 61 if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text 62 pageText = new StringBuilder(); 63 readingPageText = true; 69 64 } 70 71 else { // we're reading in the page text 72 73 if(line.equals(TEXT_END_MARKER)) { 74 // finished with a page body 75 // remove any FINAL artificial newline we introduced 76 tuples.put("pageText", pageText.toString().trim()); 77 readingPageText = false; 78 pageText = null; 79 } 80 else { 65 else if(line.equals(TEXT_END_MARKER)) { 66 // finished with a page body 67 // Remove any FINAL artificial newline we introduced to a page's body text 68 tuples.put("pageText", pageText.toString().trim()); 69 readingPageText = false; 70 pageText = null; 71 } 72 else { 73 if(readingPageText) { // So we're reading in the page text 81 74 pageText.append(line); 82 75 pageText.append("\n"); // there are no newlines within pageText 83 76 // but if there were newlines, add them back here as readLine() removes them 84 } 77 } 78 else { // dealing with the rest of the page dump's metadata 79 endIndex = line.indexOf(":"); 80 if(endIndex != -1) { 81 String k = line.substring(0, endIndex); 82 String v = line.substring(endIndex+1); 83 tuples.put(k.trim(), v.trim()); 84 } else { 85 error("No meta key for meta: " + line); 86 } 87 } 88 } 85 89 86 }87 90 } 88 91 … … 97 100 } 98 101 99 /*102 100 103 // START DEBUG 101 104 debug("__________________________________________"); … … 107 110 debug("__________________________________________"); 108 111 // END DEBUG 109 */112 110 113 } 111 114
Note:
See TracChangeset
for help on using the changeset viewer.