Changeset 33633


Timestamp:
2019-11-08T19:43:39+13:00
Author:
ak19
Message:
  1. TextLanguageDetector now has methods for collecting all sentences and all overlapping sentences. 2. Renaming NutchTextDumpProcessor.java to NutchTextDumpToCSV.java, since there is a new class, NutchTextDumpToMongoDB.java, that needs slightly different data structures.
Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited
1 moved

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33626 r33633  
    4747    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
    4848   
    49     String HOST = "localhost";
    50     int PORT = 27017; // mongodb port
    51     String PROPS_FILENAME = "config.properties";
    52     String DB_NAME = "ateacrawldata";
     49    static final String PROPS_FILENAME = "config.properties";
     50    public static final String DB_NAME = "anupama"; //"ateacrawldata";
     51    public static final String WEBPAGES_COLLECTION = "webpages";
     52    public static final String WEBSITES_COLLECTION = "websites";   
    5353   
     54    private String HOST = "localhost";
     55    private int PORT = 27017; // mongodb port
    5456    private String USERNAME;
    55     private String PASSWORD;
    56 
     57    private String PASSWORD;   
    5758
    5859    private MongoClient mongo = null;
    5960    private MongoDatabase database = null;
     61   
    6062   
    6163    public MongoDBAccess() throws Exception {
     
    118120    logger.info("Credentials: "+ credential);
    119121    }
     122
    120123   
     124    public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite,
     125           int numPages, int numPagesInMRI, int numPagesContainingMRI,
     126                  /* TODO: String geoLocationCountryCode, boolean miURL */
     127           String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl)
     128    {
     129    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     130    Document document = new Document("id", SITE_COUNTER)
     131        .append("siteFolderName", siteID)
     132        .append("domain", domainOfSite)
     133        .append("totalPages", numPages)
     134        .append("numPagesInMRI", numPagesInMRI)
     135        .append("numPagesContainingMRI", numPagesContainingMRI)
     136        .append("siteCrawledTimestamp", siteCrawledTimestamp)
     137        .append("siteCrawlUnfinished", siteCrawlUnfinished)
     138        .append("redoCrawl", redoCrawl);
     139    collection.insertOne(document);
     140    System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 
     141    }
    121142
    122     /*
    123     public void insertDocument() {
    124     MongoCollection<Document> collection = this.database.getCollection("sampleCollection");
     143   
     144    public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/
     145                  String url, String charEncoding, String modTime, String fetchTime,
     146                  boolean isMRI, int totalSentences, int numSentencesInMRI,
     147                  ArrayList<SentenceInfo> singleSentences,
     148                  ArrayList<SentenceInfo> overlappingSentences)
     149    {
     150    // load the webpages db 'table'
     151    // in mongodb, the equivalent of db tables are called 'collections'
     152    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
     153
     154    Document document = new Document("id", WEBPAGE_COUNTER)
     155        .append("siteid", site_id)
     156        .append("url", url)
     157        .append("charEncoding", charEncoding)
     158        .append("modTime", modTime)
     159        .append("fetchTime", fetchTime)
     160        .append("isMRI", isMRI)
     161        .append("totalSentences", totalSentences)
     162        .append("numSentencesInMRI", numSentencesInMRI);
     163
     164    document.put("singleSentences", singleSentences);
     165    document.put("overlappingSentences", overlappingSentences);
     166   
     167    collection.insertOne(document);
 168    System.out.println("webpage info inserted successfully into " + WEBPAGES_COLLECTION); 
    125169    }
    126     */
     170   
    127171
    128172    // TODO:
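
    The two insert methods added above are what the new NutchTextDumpToMongoDB class (mentioned in the commit message) would call to write one document per crawled site and one per web page. Below is a minimal, hypothetical caller sketch; it is not part of this changeset, all argument values are invented, and it assumes the MongoDBAccess constructor reads config.properties and prepares the connection credentials (as suggested by the "Credentials:" log line in this hunk).

        import java.util.ArrayList;

        import org.greenstone.atea.MongoDBAccess;
        import org.greenstone.atea.TextLanguageDetector.SentenceInfo;

        public class MongoDBAccessUsageSketch {
            public static void main(String[] args) throws Exception {
                // Assumption: constructor loads config.properties and sets up credentials
                MongoDBAccess mongodbAccess = new MongoDBAccess();

                // One document per crawled site, into the "websites" collection.
                // Values are made up for illustration.
                mongodbAccess.insertWebSiteInfo(1, 5, "example.org.nz",
                    120, 30, 45,
                    "2019-11-08T19:43:39", "false", false);

                // One document per web page of that site, into the "webpages" collection,
                // including the per-sentence language info produced by TextLanguageDetector.
                ArrayList<SentenceInfo> singleSentences = new ArrayList<SentenceInfo>();
                ArrayList<SentenceInfo> overlappingSentences = new ArrayList<SentenceInfo>();
                mongodbAccess.insertWebPage(1, 1,
                    "https://example.org.nz/index.html", "UTF-8",
                    "0", "1573199019",
                    true, 10, 7,
                    singleSentences, overlappingSentences);
            }
        }

    Where NutchTextDumpToCSV writes the same per-site and per-page information to CSV files, the MongoDB path stores the sentence lists as embedded arrays on each page document, which is presumably why the commit message notes that NutchTextDumpToMongoDB needs slightly different data structures.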
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java

    r33623 r33633  
    3838 * TO COMPILE:
    3939 *    maori-lang-detection/src$
    40  *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
     40 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java
    4141 *
    4242 * TO RUN:
    4343 *    maori-lang-detection/src$
    44  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
     44 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small
    4545 *
    4646 * or:
    47  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
     47 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1
    4848 *
    4949*/
    50 public class NutchTextDumpProcessor {
    51     static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
     50public class NutchTextDumpToCSV {
     51    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName());
    5252
    5353    static boolean DEBUG_MODE = true;
    5454
    5555    /** Counter for number of sites.
    56      * Should be equal to number of times NutchTextDumpProcessor constructor
     56     * Should be equal to number of times NutchTextDumpToCSV constructor
    5757     * is called: once per site.
    5858     */
     
    6666    public final boolean siteCrawlUnfinished;
    6767    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */
    68    
     68   
    6969    private String domainOfSite;
    7070   
     
    100100    }
    101101
    102     /** A NutchTextDumpProcessor processes the dump.txt for one site */
    103     public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
     102    /** A NutchTextDumpToCSV processes the dump.txt for one site */
     103    public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
    104104                  MaoriTextDetector maoriTxtDetector, String siteID,
    105105                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
    106106    throws IOException
    107107    {   
    108     // increment static counter of sites processed by a NutchTextDumpProcessor instance
     108    // increment static counter of sites processed by a NutchTextDumpToCSV instance
    109109    SITE_COUNTER++;
    110110   
     
    190190    prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    191191    }
    192    
     192
     193    /* UNUSED */
    193194    /** pageID: id into pages array */
     195    /*
    194196    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {   
    195197
     
    228230    return pagesContainingMRI.size();
    229231    }
     232    */
    230233   
    231234    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
     
    257260        // remaining elements are the actual sentences that were detected as being Māori
    258261        int totalSentences = Integer.parseInt(mriSentences.get(0));
    259         int numSentencesInMRI = mriSentences.size() - 1;       
     262        int numSentencesInMRI = mriSentences.size() - 1;
    260263
    261264        // Add page to list of MRI pages if the page's body text overall was detected
     
    367370    public static void printUsage() {
    368371    System.err.println("Run this program as:");
    369     System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
     372    System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>");
    370373    }
    371374   
     
    382385    }
    383386
    384     NutchTextDumpProcessor.DEBUG_MODE = false;
     387    NutchTextDumpToCSV.DEBUG_MODE = false;
    385388   
    386389    File websitesCSVFile = new File(sitesDir, "websites.csv");
     
    426429            long lastModified = siteDir.lastModified();
    427430            logger.debug("Found siteID: " + siteID);           
    428             NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
     431            NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV(
    429432                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
    430433                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
     
    442445       
    443446    } catch(Exception e) {
    444         // can get an exception when instantiating NutchTextDumpProcessor instance
     447        // can get an exception when instantiating NutchTextDumpToCSV instance
    445448        // or with CSV file
    446449        logger.error(e.getMessage(), e);
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33587 r33633  
    142142    }
    143143
     144    /** inner class */
     145    public class SentenceInfo {
     146    public final double confidenceLevel;
     147    /** 3 letter lang code */
     148    public final String langCode;
     149    public final String sentence;
     150
     151    public SentenceInfo(double confidence, String langCode, String sentence) {
     152        confidenceLevel = confidence;
     153        this.langCode = langCode;
     154        this.sentence = sentence;
     155    }
     156    }
     157
     158    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
 159     * into sentences? What if the text is in another language or a mix of languages?
     160     * Doesn't this assume that all languages split sentences alike? */
     161    public String[] getAllSentences(String text) {
     162       
     163    // This function doesn't work if the sentenceDetector object wasn't set up
     164    if(sentenceDetector == null) return null;
     165
     166    String[] sentences = sentenceDetector.sentDetect(text);
     167    return sentences;
     168    }
     169 
     170    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {
     171
     172    if(sentences == null) {
     173        return null;       
     174    }
     175
     176    ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
     177    for(int i = 0; i < sentences.length; i++) {
     178        String sentence = sentences[i];     
     179       
     180        //System.err.println(sentence);
     181
     182        Language bestLanguage = myCategorizer.predictLanguage(sentence);
     183        double confidence = bestLanguage.getConfidence();
     184
 185        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     186    }
     187
     188    return sentencesList;
     189    }
     190
     191    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {
     192   
     193    if(sentences == null) {
     194        return null;       
     195    }
     196
     197    ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
     198    for(int i = 1; i < sentences.length; i++) {
     199        // glue every two adjacent sentences together
     200        String sentence = sentences[i-1];
     201       
     202        String separator = ". ";
     203        // if the sentence already ends with a terminating punctuation character,
     204        // then separator is just a space
     205        sentence = sentence.trim();
     206        if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {
     207        separator = " ";
     208        }
 209        sentence = sentence + separator + sentences[i];
     210       
     211        //System.err.println(sentence);
     212
     213        Language bestLanguage = myCategorizer.predictLanguage(sentence);
     214        double confidence = bestLanguage.getConfidence();
     215
 216        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     217    }
     218
     219    return sentencesList;
     220    }
     221   
    144222    /**
    145223     * In this class' constructor, need to have set up the Sentence Detection Model
     
    269347        if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
    270348       
    271        
    272349        Language bestLanguage = myCategorizer.predictLanguage(text.toString());
    273350        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines