Changeset 33579 for gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
- Timestamp: 2019-10-17T21:05:21+13:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33578 r33579
    36     36      * TO COMPILE:
    37     37      *    maori-lang-detection/src$
    38          -  *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/ *" org/greenstone/atea/NutchTextDumpProcessor.java
           38   +  *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
    39     39      *
    40     40      * TO RUN:
    41     41      *    maori-lang-detection/src$
    42          -  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor
           42   +  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
    43     43      *
    44     44      */
     …      …
    67     67           ) {
    68     68
           69   +       boolean readingText = false;
           70   +
    69     71           while((line = reader.readLine()) != null) { // readLine removes newline separator
    70     72               line = line.trim();
    71          -           // an empty line marks the end of a page in nutch's text dump of a site
    72          -           if(!line.equals("")) {
           73   +           // iff outside of a page's body text, then an empty line marks the end of a page
           74   +           // in nutch's text dump of a site.
           75   +           // But note, there can be an empty line (or more?) between the start and end
           76   +           // markers of a page's text, though.
           77   +
           78   +           if(!readingText && line.equals("")) {
    73     79                   pageDump.append(line);
    74     80                   pageDump.append("\n");
    75          -           } else {
           81   +
           82   +
           83   +               // START DEBUG
           84   +               debug("__________________________________________");
           85   +               debug("@@@ Found page entry: ");
           86   +               debug("__________________________________________");
           87   +               debug(pageDump.toString());
           88   +               debug("------------------------------------------");
           89   +               // END DEBUG
           90   +
           91   +
    76     92                   TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
    77     93                   // parses the fields and body text of a webpage in nutch's txt dump of entire site
     …      …
    80     96                   pages.add(page);
    81     97                   pageDump = null;
           98   +
    82     99                   pageDump = new StringBuilder();
    83    100               }
          101   +           else if (!line.equals("")) { // empty line
          102   +               if(line.equals(TextDumpPage.TEXT_START_MARKER)) {
          103   +                   readingText = true;
          104   +               }
          105   +               if(line.equals(TextDumpPage.TEXT_END_MARKER)) {
          106   +                   readingText = false;
          107   +               }
          108   +               pageDump.append(line);
          109   +               pageDump.append("\n");
          110   +           }
          111   +           // can throw away any newlines between text start and end markers.
    84    112           }
    85    113
     …      …
   171    199               else {
   172    200                   String siteID = siteDir.getName();
   173          -               NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
   174    201                   debug("Found siteID: " + siteID);
          202   +               NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
   175    203               }
   176    204
Note: See TracChangeset for help on using the changeset viewer.