Changeset 33633


Timestamp:
2019-11-08T19:43:39+13:00
Author:
ak19
Message:
  1. TextLanguageDetector now has methods for collecting all sentences and all overlapping sentences. 2. Renaming NutchTextDumpProcessor.java to NutchTextDumpToCSV.java, since there is a new class, NutchTextDumpToMongoDB.java, that needs slightly different data structures.
Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited
1 moved

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33626 r33633  
    4747    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
    4848   
    49     String HOST = "localhost";
    50     int PORT = 27017; // mongodb port
    51     String PROPS_FILENAME = "config.properties";
    52     String DB_NAME = "ateacrawldata";
     49    static final String PROPS_FILENAME = "config.properties";
     50    public static final String DB_NAME = "anupama"; //"ateacrawldata";
     51    public static final String WEBPAGES_COLLECTION = "webpages";
     52    public static final String WEBSITES_COLLECTION = "websites";   
    5353   
     54    private String HOST = "localhost";
     55    private int PORT = 27017; // mongodb port
    5456    private String USERNAME;
    55     private String PASSWORD;
    56 
     57    private String PASSWORD;   
    5758
    5859    private MongoClient mongo = null;
    5960    private MongoDatabase database = null;
     61   
    6062   
    6163    public MongoDBAccess() throws Exception {
     
    118120    logger.info("Credentials: "+ credential);
    119121    }
     122
    120123   
     124    public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite,
     125           int numPages, int numPagesInMRI, int numPagesContainingMRI,
     126                  /* TODO: String geoLocationCountryCode, boolean miURL */
     127           String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl)
     128    {
     129    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     130    Document document = new Document("id", SITE_COUNTER)
     131        .append("siteFolderName", siteID)
     132        .append("domain", domainOfSite)
     133        .append("totalPages", numPages)
     134        .append("numPagesInMRI", numPagesInMRI)
     135        .append("numPagesContainingMRI", numPagesContainingMRI)
     136        .append("siteCrawledTimestamp", siteCrawledTimestamp)
     137        .append("siteCrawlUnfinished", siteCrawlUnfinished)
     138        .append("redoCrawl", redoCrawl);
     139    collection.insertOne(document);
     140    System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 
     141    }
    121142
    122     /*
    123     public void insertDocument() {
    124     MongoCollection<Document> collection = this.database.getCollection("sampleCollection");
     143   
     144    public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/
     145                  String url, String charEncoding, String modTime, String fetchTime,
     146                  boolean isMRI, int totalSentences, int numSentencesInMRI,
     147                  ArrayList<SentenceInfo> singleSentences,
     148                  ArrayList<SentenceInfo> overlappingSentences)
     149    {
     150    // load the webpages db 'table'
     151    // in mongodb, the equivalent of db tables are called 'collections'
     152    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
     153
     154    Document document = new Document("id", WEBPAGE_COUNTER)
     155        .append("siteid", site_id)
     156        .append("url", url)
     157        .append("charEncoding", charEncoding)
     158        .append("modTime", modTime)
     159        .append("fetchTime", fetchTime)
     160        .append("isMRI", isMRI)
     161        .append("totalSentences", totalSentences)
     162        .append("numSentencesInMRI", numSentencesInMRI);
     163
     164    document.put("singleSentences", singleSentences);
     165    document.put("overlappingSentences", overlappingSentences);
     166   
     167    collection.insertOne(document);
 168    System.out.println("webpage info inserted successfully into " + WEBPAGES_COLLECTION); 
    125169    }
    126     */
     170   
    127171
    128172    // TODO:
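
    The two insert methods added above are what the new NutchTextDumpToMongoDB class (mentioned in the commit message) would call to write one document per crawled site and one per web page. Below is a minimal, hypothetical caller sketch; it is not part of this changeset, all argument values are invented, and it assumes the MongoDBAccess constructor reads config.properties and prepares the connection credentials (as suggested by the "Credentials:" log line in this hunk).

        import java.util.ArrayList;

        import org.greenstone.atea.MongoDBAccess;
        import org.greenstone.atea.TextLanguageDetector.SentenceInfo;

        public class MongoDBAccessUsageSketch {
            public static void main(String[] args) throws Exception {
                // Assumption: constructor loads config.properties and sets up credentials
                MongoDBAccess mongodbAccess = new MongoDBAccess();

                // One document per crawled site, into the "websites" collection.
                // Values are made up for illustration.
                mongodbAccess.insertWebSiteInfo(1, 5, "example.org.nz",
                    120, 30, 45,
                    "2019-11-08T19:43:39", "false", false);

                // One document per web page of that site, into the "webpages" collection,
                // including the per-sentence language info produced by TextLanguageDetector.
                ArrayList<SentenceInfo> singleSentences = new ArrayList<SentenceInfo>();
                ArrayList<SentenceInfo> overlappingSentences = new ArrayList<SentenceInfo>();
                mongodbAccess.insertWebPage(1, 1,
                    "https://example.org.nz/index.html", "UTF-8",
                    "0", "1573199019",
                    true, 10, 7,
                    singleSentences, overlappingSentences);
            }
        }

    Where NutchTextDumpToCSV writes the same per-site and per-page information to CSV files, the MongoDB path stores the sentence lists as embedded arrays on each page document, which is presumably why the commit message notes that NutchTextDumpToMongoDB needs slightly different data structures.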
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java

    r33623 r33633  
    3838 * TO COMPILE:
    3939 *    maori-lang-detection/src$
    40  *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
     40 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java
    4141 *
    4242 * TO RUN:
    4343 *    maori-lang-detection/src$
    44  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
     44 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small
    4545 *
    4646 * or:
    47  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
     47 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1
    4848 *
    4949*/
    50 public class NutchTextDumpProcessor {
    51     static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
     50public class NutchTextDumpToCSV {
     51    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName());
    5252
    5353    static boolean DEBUG_MODE = true;
    5454
    5555    /** Counter for number of sites.
    56      * Should be equal to number of times NutchTextDumpProcessor constructor
     56     * Should be equal to number of times NutchTextDumpToCSV constructor
    5757     * is called: once per site.
    5858     */
     
    6666    public final boolean siteCrawlUnfinished;
    6767    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */
    68    
     68   
    6969    private String domainOfSite;
    7070   
     
    100100    }
    101101
    102     /** A NutchTextDumpProcessor processes the dump.txt for one site */
    103     public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
     102    /** A NutchTextDumpToCSV processes the dump.txt for one site */
     103    public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
    104104                  MaoriTextDetector maoriTxtDetector, String siteID,
    105105                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
    106106    throws IOException
    107107    {   
    108     // increment static counter of sites processed by a NutchTextDumpProcessor instance
     108    // increment static counter of sites processed by a NutchTextDumpToCSV instance
    109109    SITE_COUNTER++;
    110110   
     
    190190    prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    191191    }
    192    
     192
     193    /* UNUSED */
    193194    /** pageID: id into pages array */
     195    /*
    194196    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {   
    195197
     
    228230    return pagesContainingMRI.size();
    229231    }
     232    */
    230233   
    231234    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
     
    257260        // remaining elements are the actual sentences that were detected as being Māori
    258261        int totalSentences = Integer.parseInt(mriSentences.get(0));
    259         int numSentencesInMRI = mriSentences.size() - 1;       
     262        int numSentencesInMRI = mriSentences.size() - 1;
    260263
    261264        // Add page to list of MRI pages if the page's body text overall was detected
     
    367370    public static void printUsage() {
    368371    System.err.println("Run this program as:");
    369     System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
     372    System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>");
    370373    }
    371374   
     
    382385    }
    383386
    384     NutchTextDumpProcessor.DEBUG_MODE = false;
     387    NutchTextDumpToCSV.DEBUG_MODE = false;
    385388   
    386389    File websitesCSVFile = new File(sitesDir, "websites.csv");
     
    426429            long lastModified = siteDir.lastModified();
    427430            logger.debug("Found siteID: " + siteID);           
    428             NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
     431            NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV(
    429432                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
    430433                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
     
    442445       
    443446    } catch(Exception e) {
    444         // can get an exception when instantiating NutchTextDumpProcessor instance
     447        // can get an exception when instantiating NutchTextDumpToCSV instance
    445448        // or with CSV file
    446449        logger.error(e.getMessage(), e);
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33587 r33633  
    142142    }
    143143
     144    /** inner class */
     145    public class SentenceInfo {
     146    public final double confidenceLevel;
     147    /** 3 letter lang code */
     148    public final String langCode;
     149    public final String sentence;
     150
     151    public SentenceInfo(double confidence, String langCode, String sentence) {
     152        confidenceLevel = confidence;
     153        this.langCode = langCode;
     154        this.sentence = sentence;
     155    }
     156    }
     157
     158    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
 159     * into sentences? What if the text is in another language or a mix of languages?
     160     * Doesn't this assume that all languages split sentences alike? */
     161    public String[] getAllSentences(String text) {
     162       
     163    // This function doesn't work if the sentenceDetector object wasn't set up
     164    if(sentenceDetector == null) return null;
     165
     166    String[] sentences = sentenceDetector.sentDetect(text);
     167    return sentences;
     168    }
     169 
     170    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {
     171
     172    if(sentences == null) {
     173        return null;       
     174    }
     175
     176    ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
     177    for(int i = 0; i < sentences.length; i++) {
     178        String sentence = sentences[i];     
     179       
     180        //System.err.println(sentence);
     181
     182        Language bestLanguage = myCategorizer.predictLanguage(sentence);
     183        double confidence = bestLanguage.getConfidence();
     184
 185        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     186    }
     187
     188    return sentencesList;
     189    }
     190
     191    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {
     192   
     193    if(sentences == null) {
     194        return null;       
     195    }
     196
     197    ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
     198    for(int i = 1; i < sentences.length; i++) {
     199        // glue every two adjacent sentences together
     200        String sentence = sentences[i-1];
     201       
     202        String separator = ". ";
     203        // if the sentence already ends with a terminating punctuation character,
     204        // then separator is just a space
     205        sentence = sentence.trim();
     206        if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {
     207        separator = " ";
     208        }
 209        sentence = sentence + separator + sentences[i];
     210       
     211        //System.err.println(sentence);
     212
     213        Language bestLanguage = myCategorizer.predictLanguage(sentence);
     214        double confidence = bestLanguage.getConfidence();
     215
 216        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     217    }
     218
     219    return sentencesList;
     220    }
     221   
    144222    /**
    145223     * In this class' constructor, need to have set up the Sentence Detection Model
     
    269347        if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
    270348       
    271        
    272349        Language bestLanguage = myCategorizer.predictLanguage(text.toString());
    273350        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines