Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33600

Timestamp:

2019-10-23T23:05:38+13:00 (5 years ago)

Author:

ak19

Message:

Work in progress of writing out CSV files. In future, may write the same info to MySQL DB instead. This commit only does the first of 3 tables, the websites csv file.

Location:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea

Files:

: 2 edited

MRIWebPageStats.java (modified) (1 diff)
NutchTextDumpProcessor.java (modified) (14 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

-              r33587
+              r33600
     return str.toString();
+    }
+    /** for converting to csv */
+    public String[] toCSV() {
+    String[] csvRecord = { Integer.toString(pageID),
+                   siteID, // foreign key
+                   URL,
+                   Boolean.toString(isMRI),
+                   Integer.toString(numSentences),
+                   Integer.toString(numSentencesInMRI)
+    };
+    return csvRecord;
+    }
+}

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33587
+              r33600
 import java.io.*;
 import java.lang.ArrayIndexOutOfBoundsException;
+import java.time.LocalDateTime;
 import java.util.ArrayList;
 import java.util.Arrays;
+import org.apache.commons.csv.*;
 import org.apache.log4j.Logger;
 /**
 …
  * TO RUN:
  *    maori-lang-detection/src$
  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
+ *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
+ *
  * or:
  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1
+ *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
+ *
 */
 …
     static boolean DEBUG_MODE = true;
+    /** Counter for number of sites.
+     * Should be equal to number of times NutchTextDumpProcessor constructor
+     * is called: once per site.
+     */
+    static private int SITE_COUNTER = 0;
     private final MaoriTextDetector maoriTxtDetector;
+    public final String siteID; // is this necessary?
+    public final String siteID;
+    public final boolean siteCrawlUnfinished;
+    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */
     private String domainOfSite;
 …
     /** list of pages in this site which were detected as being in MRI */
     private ArrayList<MRIWebPageStats> pagesInMRI;
+    /** list of pages in this site which were NOT detected as being in MRI but nevertheless
+    /**
+     * list of pages in this site which were NOT detected as being in MRI but nevertheless
      * contain one or more sentences in MRI
      */
 …
+    }
+    }
+    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
+    /** A NutchTextDumpProcessor processes the dump.txt for one site */
+    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID,
+                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
+    {
+    // increment static counter of sites processed by a NutchTextDumpProcessor instance
+    SITE_COUNTER++;
     // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
     this.siteID = siteID;
+    this.siteCrawlUnfinished = siteCrawlUnfinished;
+    this.siteCrawledTimestamp = lastModified;
     this.maoriTxtDetector = maoriTxtDetector;
     pages = new ArrayList<TextDumpPage>();
 …
     public int totalNumPages() {
     return pages.size();
+    }
+    }
+    public int getNumPagesInMRI() {
+    return pagesInMRI.size();
+    }
+    public int getNumPagesContainingMRI() {
+    return pagesContainingMRI.size();
+    }
     private void prepareSiteStats() {
     pagesInMRI = new ArrayList<MRIWebPageStats>();
 …
         // Even if the entire page is not found to be overall in Māori,
         // let's sitll inspect the sentences of the page and count how many (if any)
+        // let's still inspect the sentences of the page and count how many (if any)
         // are in te reo.
         ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
 …
+        }
+        }
+    }
+    }
 …
+    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {
+    // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
+    // LocalDateTime date =
+    //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
+    // String crawlTimestamp =
+    //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));
+    boolean redoCrawl = false;
+    int numPagesInMRI = pagesInMRI.size();
+    int numPagesContainingMRI = pagesContainingMRI.size();
+    if(this.siteCrawlUnfinished) {
+        // arbitrary decision, but need some indication that the MRI content was not close to one-off in the website
+        if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
+        redoCrawl = true;
+        }
+    }
+    // websites.csv CSV file row:
+    // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numOtherPagesContainingMRI, nutchCrawlTimestamp, crawlUnfinished, redoCrawl
+    websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
+                       pages.size(), numPagesInMRI, numPagesContainingMRI,
+                   this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
+    }
+    /*
+    public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences)
+    throws IOException
+    {
+    int totalSentences
+    for(int i = 0; i < )
+        printer.printRecord();
+    } catch (IOException ex) {
+        ex.printStackTrace();
+    }
+    }
+    */
+    /*
+    public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) {
+    // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
+    //CSVFormat csvFormat = CSVFormat.DEFAULT.
+    //    withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
+    try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) {
+        // header
+        //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
+        // skip first one
+        for()
+        printer.printRecord();
+    } catch (IOException ex) {
+        ex.printStackTrace();
+    }
+    }
+    */
     // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
     public static void info(String msg) {
 …
     public static void printUsage() {
     info("Run this program as:");
     info("\tNutchTextDumpProcessor <path to 'sites' folder>");
+    info("\tNutchTextDumpProcessor <path to 'crawled' folder>");
+    }
 …
     NutchTextDumpProcessor.DEBUG_MODE = false;
+    try {
+    File websitesCSVFile = new File(sitesDir, "websites.csv");
+    try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) {
+        // print out the column headers for the websites csv file
+        websitesCSVPrinter.printRecord("ID", "siteID", "domainURL",
+                   "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
+                   "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
         MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
         File[] sites = sitesDir.listFiles();
 …
             else {
+            File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");
             String siteID = siteDir.getName();
+            long lastModified = siteDir.lastModified();
             debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+            NutchTextDumpProcessor nutchTxtDump
+                = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
             // now it's parsed all the web pages in the site's text dump
 …
             nutchTxtDump.printSiteStats();
+            nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
+            }
+        }
 …
     } catch(Exception e) {
         // can get an exception when instantiating CCWETProcessor instance
+        // or with CSV file
         error(e.getMessage(), e);
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33600

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Download in other formats: