Context Navigation

← Previous Change
Next Change →

Changeset 33602 for gs3-extensions

Timestamp:

2019-10-23T23:49:34+13:00 (5 years ago)

Author:

ak19

Message:

1. The final csv file, mri-sentences.csv, is now written out. 2. Only pages that are overall in MRI or contain any MRI sentences now get entries in the webpages csv file. 3. Corrections to the ID columns written to the webpages and websites csv files. 4. Some cleanup of unused code.

Location:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea

Files:

: 2 edited

MRIWebPageStats.java (modified) (2 diffs)
NutchTextDumpProcessor.java (modified) (10 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

-              r33600
+              r33602
     /** for converting to csv */
+    /*
+      Unused.
     public String[] toCSV() {
     String[] csvRecord = { Integer.toString(pageID),
 …
     return csvRecord;
+    }
+    */
+}

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33601
+              r33602
      */
     static private int SITE_COUNTER = 0;
+    static private long WEBPAGE_COUNTER = 0;
+    static private long MRI_SENTENCE_COUNTER = 0;
     private final MaoriTextDetector maoriTxtDetector;
 …
     /** A NutchTextDumpProcessor processes the dump.txt for one site */
     public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter,
+    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                   MaoriTextDetector maoriTxtDetector, String siteID,
                   File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
 …
+    }
     prepareSiteStats(webpagesCSVPrinter);
+    prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
+    }
 …
+    }
     private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException {
+    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
     pagesInMRI = new ArrayList<MRIWebPageStats>();
     pagesContainingMRI = new ArrayList<MRIWebPageStats>();
 …
         String text = page.getPageText();
-        String url = page.getPageURL();
         if(text.equals("")) {
         page.addMRILanguageStatus(false);
-        // write to webpage CSV for all pages
-        // not just those webpages that are overall MRI, or those that contain MRI sentences
-        // CSV column headings:
-        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
-        webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);
         continue;
+        }
 …
         // were detected as being in MRI
         if(isMRI || numSentencesInMRI >= 1) {
+            String url = page.getPageURL();
             MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                            totalSentences, numSentencesInMRI);
 …
+            }
+        }
+        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
+        webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI);
+            // Only write to webpages csv file for those pages that had any MRI
+            // language content.
+            // column headers:
+            //    pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
+            //int pageID = i; // not primary key by itself,
+                    // must be combined with siteID to form primary key
+            webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++,
+                          SITE_COUNTER, /* alternative: this.siteID */
+                          url, isMRI, totalSentences, numSentencesInMRI);
+            // Write the sentences that are in te reo into the mri-sentences CSV file
+            // whether from webpages that are MRI overall or only from those containing
+            // any sentences in MRI
+            for (int j = 1; j < mriSentences.size(); j++) { // 1st element not a sentence
+            //int sentenceID = j; // combine with siteID and pageID to form primary key
+            String mriSentence = mriSentences.get(j);
+            // sentenceID, pageID, sentence
+            //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
+            mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, WEBPAGE_COUNTER, mriSentence);
+            }
+        }
+        }
+    }
 …
+    }
-    /*
-    public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences)
-    throws IOException
+    {
-    int totalSentences
-    for(int i = 0; i < )
-        printer.printRecord();
-    } catch (IOException ex) {
-        ex.printStackTrace();
+    }
+    }
-    */
-    /*
-    public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) {
-    // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
-    //CSVFormat csvFormat = CSVFormat.DEFAULT.
-    //    withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
-    try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) {
-        // header
-        //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
-        // skip first one
-        for()
-        printer.printRecord();
-    } catch (IOException ex) {
-        ex.printStackTrace();
+    }
+    }
-    */
     // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
 …
     File websitesCSVFile = new File(sitesDir, "websites.csv");
     File webpagesCSVFile = new File(sitesDir, "webpages.csv");
+    File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");
     try (
          CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
          CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
+         CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
          ) {
         // print out the column headers for the websites csv file
+        websitesCSVPrinter.printRecord("ID", "siteID", "domainURL",
+                   "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
+                   "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
+        webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI",
+        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
+        websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID"/* site folder name*/,
+           "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
+           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
+        webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI",
                        "numSentences", "numSentencesInMRI");
+        mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");
         MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
 …
             long lastModified = siteDir.lastModified();
             debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump
+                = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
+                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
+                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
             // now it's parsed all the web pages in the site's text dump

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33602 for gs3-extensions

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Download in other formats: