Context Navigation

← Previous Change
Next Change →

NutchTextDumpProcessor.java

Timestamp:

2019-10-17T21:44:46+13:00 (5 years ago)

Author:

ak19

Message:

Finally fixed the thus-far identified bugs when parsing dump.txt.

File:

1 edited

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java (modified) (5 diffs)

Legend:

Unmodified
Added (lines prefixed with +)
Removed (lines prefixed with -)

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33579
+              r33580
     /** keep a list to store the text of each page */
     private ArrayList<TextDumpPage> pages;
+    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
+    // The start of a new web page's record in nutch's text dump of an entire site
+    // is denoted by a newline followed by a URL (protocol)
+    // or the very start of the file with a URL (protocol)
+    return ((prevLine == null || prevLine.equals(""))
+        && (line.startsWith("http://") || line.startsWith("https://")));
+    }
+    public void debugPageDump(StringBuilder pageDump) {
+    // START DEBUG
+    debug("__________________________________________");
+    debug("@@@ Found page entry: ");
+    debug("__________________________________________");
+    debug(pageDump.toString());
+    debug("------------------------------------------");
+    // END DEBUG
+    }
     public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
 …
     String line = null;
     StringBuilder pageDump = new StringBuilder();
+    StringBuilder pageDump = null;
     try (
          BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
 …
         boolean readingText = false;
+        String prevLine = null;
         while((line = reader.readLine()) != null) { // readLine removes newline separator
 …
         // markers of a page's text, though.
+        if(!readingText && line.equals("")) {
+        if(isStartOfNewWebPageRecord(prevLine, line)) {
+            if(pageDump != null) { // should also be the case then: if(prevLine != null)
+            // finish old pageDump and begin new one
+            //debugPageDump(pageDump);
+            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
+            // parses the fields and body text of a webpage in nutch's txt dump of entire site
+            //page.parseFields();
+            //page.getText();
+            pages.add(page);
+            pageDump = null;
+            }
+            // begin new webpage dump
+            pageDump = new StringBuilder();
             pageDump.append(line);
             pageDump.append("\n");
-            // START DEBUG
-            debug("__________________________________________");
-            debug("@@@ Found page entry: ");
-            debug("__________________________________________");
-            debug(pageDump.toString());
-            debug("------------------------------------------");
-            // END DEBUG
-            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
-            // parses the fields and body text of a webpage in nutch's txt dump of entire site
-            //page.parseFields();
-            //page.getText();
-            pages.add(page);
-            pageDump = null;
-            pageDump = new StringBuilder();
+        }
+        else if (!line.equals("")) { // empty line
+            if(line.equals(TextDumpPage.TEXT_START_MARKER)) {
+            readingText = true;
+            }
+            if(line.equals(TextDumpPage.TEXT_END_MARKER)) {
+            readingText = false;
+            }
+        else if(!line.equals("")) {
             pageDump.append(line);
             pageDump.append("\n");
+        }
         // can throw away any newlines between text start and end markers.
+        prevLine = line;
+        }
+        // process final webpage record:
+        //debugPageDump(pageDump);
+        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
+        pages.add(page);
+        pageDump = null;
     } catch (IOException ioe) {
 …
         File[] sites = sitesDir.listFiles();
         for(File siteDir : sites) { // e.g. 00001
+        // look for dump.txt
+        File txtDumpFile = new File(siteDir, "dump.txt");
+        if(!txtDumpFile.exists()) {
+            error("Text dump file " + txtDumpFile + " did not exist");
+            continue;
+        }
+        else {
+            String siteID = siteDir.getName();
+            debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+        if(siteDir.isDirectory()) {
+            // look for dump.txt
+            File txtDumpFile = new File(siteDir, "dump.txt");
+            if(!txtDumpFile.exists()) {
+            error("Text dump file " + txtDumpFile + " did not exist");
+            continue;
+            }
+            else {
+            String siteID = siteDir.getName();
+            debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+            }
+        }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33580 for gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Download in other formats: