Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33601

Timestamp:

2019-10-23T23:22:14+13:00 (4 years ago)

Author:

ak19

Message:

Creates the second CSV file (webpages.csv), with info about webpages. At present it also stores irrelevant records, such as webpages with no content (no text body) or webpages that were neither detected as MRI overall nor had any MRI sentences. In the subsequent commit, these webpages will be discarded.

File:

: 1 edited

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java (modified) (10 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33600
+              r33601
     /** A NutchTextDumpProcessor processes the dump.txt for one site */
+    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID,
+    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter,
+                  MaoriTextDetector maoriTxtDetector, String siteID,
                   File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
+    throws IOException
+    {
     // increment static counter of sites processed by a NutchTextDumpProcessor instance
 …
+    }
     prepareSiteStats();
+    prepareSiteStats(webpagesCSVPrinter);
+    }
 …
+    }
     private void prepareSiteStats() {
+    private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException {
     pagesInMRI = new ArrayList<MRIWebPageStats>();
     pagesContainingMRI = new ArrayList<MRIWebPageStats>();
 …
         String text = page.getPageText();
+        String url = page.getPageURL();
         if(text.equals("")) {
         page.addMRILanguageStatus(false);
+        // write to webpage CSV for all pages
+        // not just those webpages that are overall MRI, or those that contain MRI sentences
+        // CSV column headings:
+        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
+        webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);
         continue;
+        }
 …
         // are in te reo.
         ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
         // first element is always total num sentences
+        // first element of ArrayList returned is always total num sentences on page
         // remaining elements are the actual sentences that were detected as being Māori
         int totalSentences = Integer.parseInt(mriSentences.get(0));
+        int numSentencesInMRI = mriSentences.size() - 1;
+        int numSentencesInMRI = mriSentences.size() - 1;
         // Add page to list of MRI pages if the page's body text overall was detected
 …
         // were detected as being in MRI
         if(isMRI || numSentencesInMRI >= 1) {
             String url = page.getPageURL();
             MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                            totalSentences, numSentencesInMRI);
 …
+            }
+        }
+        }
+        }
+        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
+        webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI);
+        }
+    }
+    }
 …
     File websitesCSVFile = new File(sitesDir, "websites.csv");
+    try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) {
+    File webpagesCSVFile = new File(sitesDir, "webpages.csv");
+    try (
+         CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
+         CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
+         ) {
         // print out the column headers for the websites csv file
 …
                    "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                    "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
+        webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI",
+                       "numSentences", "numSentencesInMRI");
         MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
 …
             debug("Found siteID: " + siteID);
             NutchTextDumpProcessor nutchTxtDump
                 = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
+                = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
             // now it's parsed all the web pages in the site's text dump

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33601

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Download in other formats: