Changeset 33579 for gs3-extensions
- Timestamp:
- 2019-10-17T21:05:21+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33578 r33579 36 36 * TO COMPILE: 37 37 * maori-lang-detection/src$ 38 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/ *" org/greenstone/atea/NutchTextDumpProcessor.java38 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java 39 39 * 40 40 * TO RUN: 41 41 * maori-lang-detection/src$ 42 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor 42 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled 43 43 * 44 44 */ … … 67 67 ) { 68 68 69 boolean readingText = false; 70 69 71 while((line = reader.readLine()) != null) { // readLine removes newline separator 70 72 line = line.trim(); 71 // an empty line marks the end of a page in nutch's text dump of a site 72 if(!line.equals("")) { 73 // iff outside of a page's body text, then an empty line marks the end of a page 74 // in nutch's text dump of a site. 75 // But note, there can be an empty line (or more?) between the start and end 76 // markers of a page's text, though. 77 78 if(!readingText && line.equals("")) { 73 79 pageDump.append(line); 74 80 pageDump.append("\n"); 75 } else { 81 82 83 // START DEBUG 84 debug("__________________________________________"); 85 debug("@@@ Found page entry: "); 86 debug("__________________________________________"); 87 debug(pageDump.toString()); 88 debug("------------------------------------------"); 89 // END DEBUG 90 91 76 92 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 77 93 // parses the fields and body text of a webpage in nutch's txt dump of entire site … … 80 96 pages.add(page); 81 97 pageDump = null; 98 82 99 pageDump = new StringBuilder(); 83 100 } 101 else if (!line.equals("")) { // empty line 102 if(line.equals(TextDumpPage.TEXT_START_MARKER)) { 103 readingText = true; 104 } 105 if(line.equals(TextDumpPage.TEXT_END_MARKER)) { 106 readingText = false; 107 } 108 pageDump.append(line); 109 pageDump.append("\n"); 110 } 111 // can throw away any newlines between text start and end markers. 84 112 } 85 113 … … 171 199 else { 172 200 String siteID = siteDir.getName(); 173 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);174 201 debug("Found siteID: " + siteID); 202 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 175 203 } 176 204 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33578 r33579 27 27 // first line always has a "key:" somewhere after the pageURL 28 28 int endIndex = line.indexOf("key:"); 29 String pageURL = line.substring(endIndex); 29 30 //String pageURL = line.substring(0, endIndex); 31 String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex); 32 30 33 31 34 tuples.put("pageURL", pageURL.trim()); 32 35 33 String key = line.substring(endIndex); 34 tuples.put("key", key.trim()); 36 if(endIndex != -1) { 37 String key = line.substring(endIndex); 38 tuples.put("key", key.trim()); 39 } else { 40 debug("@@@@ no key for pageURL: " + pageURL); 41 } 42 43 if(pageURL.contains(TEXT_END_MARKER)) { 44 debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: "); 45 debug("+++++++++"); 46 debug(unparsedPageDump); 47 debug("+++++++++"); 48 } 35 49 36 50 boolean readingPageText = false; … … 77 91 tuples.put("pageText", ""); 78 92 } 93 79 94 80 95 } catch (IOException ioe) { 81 96 error("@@@@@@@@@ Error reading in txtdump of a page.", ioe); 82 97 } 98 99 /* 100 // START DEBUG 101 debug("__________________________________________"); 102 for(Map.Entry<String, String> entry : tuples.entrySet()) { 103 String key = entry.getKey(); 104 String value = entry.getValue(); 105 debug(key + " - " + value); 106 } 107 debug("__________________________________________"); 108 // END DEBUG 109 */ 83 110 } 84 111
Note:
See TracChangeset
for help on using the changeset viewer.