Context Navigation

← Previous Change
Next Change →

Changeset 33580 for gs3-extensions

Timestamp:

2019-10-17T21:44:46+13:00 (5 years ago)

Author:

ak19

Message:

Finally fixed the thus-far identified bugs when parsing dump.txt.

Location:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea

Files:

2 edited

NutchTextDumpProcessor.java (modified) (5 diffs)
TextDumpPage.java (modified) (5 diffs)

Legend:

Unmodified
Added
Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33579
+              r33580
     /** keep a list to store the text of each page */
     private ArrayList<TextDumpPage> pages;
+    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
+    // Returns true when 'line' opens a new web page record in nutch's text dump of an entire site:
+    // 'line' must start with a URL protocol ("http://" or "https://") and 'prevLine' must be either
+    // empty (a blank separator line) or null (very start of the file). 'line' itself must not be null.
+    return ((prevLine == null || prevLine.equals(""))
+        && (line.startsWith("http://") || line.startsWith("https://")));
+    }
+    public void debugPageDump(StringBuilder pageDump) {
+    // START DEBUG: pass the given page record's text to debug(), framed by a header and separator lines
+    debug("__________________________________________");
+    debug("@@@ Found page entry: ");
+    debug("__________________________________________");
+    debug(pageDump.toString());
+    debug("------------------------------------------");
+    // END DEBUG (pageDump must be non-null here, since toString() is called on it directly)
+    }
     public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
 …
     String line = null;
     StringBuilder pageDump = new StringBuilder();
+    StringBuilder pageDump = null;
     try (
          BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
 …
         boolean readingText = false;
+        String prevLine = null;
         while((line = reader.readLine()) != null) { // readLine removes newline separator
 …
         // markers of a page's text, though.
+        if(!readingText && line.equals("")) {
+        if(isStartOfNewWebPageRecord(prevLine, line)) {
+            if(pageDump != null) { // should also be the case then: if(prevLine != null)
+            // finish old pageDump and begin new one
+            //debugPageDump(pageDump);
+            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
+            // parses the fields and body text of a webpage in nutch's txt dump of entire site
+            //page.parseFields();
+            //page.getText();
+            pages.add(page);
+            pageDump = null;
+            }
+            // begin new webpage dump
+            pageDump = new StringBuilder();
             pageDump.append(line);
             pageDump.append("\n");
-            // START DEBUG
-            debug("__________________________________________");
-            debug("@@@ Found page entry: ");
-            debug("__________________________________________");
-            debug(pageDump.toString());
-            debug("------------------------------------------");
-            // END DEBUG
-            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
-            // parses the fields and body text of a webpage in nutch's txt dump of entire site
-            //page.parseFields();
-            //page.getText();
-            pages.add(page);
-            pageDump = null;
-            pageDump = new StringBuilder();
+        }
+        else if (!line.equals("")) { // empty line
+            if(line.equals(TextDumpPage.TEXT_START_MARKER)) {
+            readingText = true;
+            }
+            if(line.equals(TextDumpPage.TEXT_END_MARKER)) {
+            readingText = false;
+            }
+        else if(!line.equals("")) {
             pageDump.append(line);
             pageDump.append("\n");
+        }
         // can throw away any newlines between text start and end markers.
+        prevLine = line;
+        }
+        // process final webpage record:
+        //debugPageDump(pageDump);
+        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
+        pages.add(page);
+        pageDump = null;
     } catch (IOException ioe) {
 …
         File[] sites = sitesDir.listFiles();
         for(File siteDir : sites) { // e.g. 00001
+        // look for dump.txt
+        File txtDumpFile = new File(siteDir, "dump.txt");
+        if(!txtDumpFile.exists()) {
+            error("Text dump file " + txtDumpFile + " did not exist");
+            continue;
+        }
+        else {
+            String siteID = siteDir.getName();
+            debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+        if(siteDir.isDirectory()) {
+            // look for dump.txt
+            File txtDumpFile = new File(siteDir, "dump.txt");
+            if(!txtDumpFile.exists()) {
+            error("Text dump file " + txtDumpFile + " did not exist");
+            continue;
+            }
+            else {
+            String siteID = siteDir.getName();
+            debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+            }
+        }

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

-              r33579
+              r33580
         int endIndex = line.indexOf("key:");
         //String pageURL = line.substring(0, endIndex);
         String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
+        String pageURL = line.substring(0, endIndex);
+        //String pageURL = (endIndex == -1) ? line : line.substring(0, endIndex);
         tuples.put("pageURL", pageURL.trim());
         if(endIndex != -1) {
         String key = line.substring(endIndex);
         tuples.put("key", key.trim());
         } else {
         debug("@@@@ no key for pageURL: " + pageURL);
+        }
+        //if(endIndex != -1) {
+        String key = line.substring(endIndex);
+        tuples.put("key", key.trim());
+        //} else {
+        //debug("@@@@ no key for pageURL: " + pageURL);
+        //}
+        /*
         if(pageURL.contains(TEXT_END_MARKER)) {
         debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
 …
         debug("+++++++++");
+        }
+        */
         boolean readingPageText = false;
 …
         // continue reading all other tuples for this page, if any
         while((line = reader.readLine()) != null) {
+        line = line.trim();
+        // check if we're dealing with metadata or start/end of page's text body
+        // or actual text body
+        if(!readingPageText) {
+            // check if we're dealing with metadata or start/end of pagetext
+            endIndex = line.indexOf(":");
+            if(endIndex != -1) { // dealing with the rest of the page dump's metadata
+            String k = line.substring(0, endIndex);
+            String v = line.substring(endIndex+1);
+            tuples.put(k.trim(), v.trim());
+            }
+            else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
+            pageText = new StringBuilder();
+            readingPageText = true;
+            }
+        if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text
+            pageText = new StringBuilder();
+            readingPageText = true;
+        }
+        else { // we're reading in the page text
+            if(line.equals(TEXT_END_MARKER)) {
+            // finished with a page body
+            // remove any FINAL artificial newline we introduced
+            tuples.put("pageText", pageText.toString().trim());
+            readingPageText = false;
+            pageText = null;
+            }
+            else {
+        else if(line.equals(TEXT_END_MARKER)) {
+            // finished with a page body
+            // Remove any FINAL artificial newline we introduced to a page's body text
+            tuples.put("pageText", pageText.toString().trim());
+            readingPageText = false;
+            pageText = null;
+        }
+        else {
+            if(readingPageText) { // So we're reading in the page text
             pageText.append(line);
             pageText.append("\n"); // there are no newlines within pageText
             // but if there were newlines, add them back here as readLine() removes them
+            }
+            }
+            else { // dealing with the rest of the page dump's metadata
+            endIndex = line.indexOf(":");
+            if(endIndex != -1) {
+                String k = line.substring(0, endIndex);
+                String v = line.substring(endIndex+1);
+                tuples.put(k.trim(), v.trim());
+            } else {
+                error("No meta key for meta: " + line);
+            }
+            }
+        }
+        }
+        }
 …
+    }
     /*
     // START DEBUG
     debug("__________________________________________");
 …
     debug("__________________________________________");
     // END DEBUG
     */
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33580 for gs3-extensions

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

Download in other formats: