Changeset 33633

Show
Ignore:
Timestamp:
08.11.2019 19:43:39 (6 days ago)
Author:
ak19
Message:

1. TextLanguageDetector? now has methods for collecting all sentences and all overlapping sentences. 2. Renaming NutchTextDumpProcessor?.java to NutchTextDumpToCSV.java, since there is a new class, NutchTextDumpToMongoDB.java, that needs slightly different data structures.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified
1 moved

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33626 r33633  
    4747    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 
    4848     
    49     String HOST = "localhost"; 
    50     int PORT = 27017; // mongodb port 
    51     String PROPS_FILENAME = "config.properties"; 
    52     String DB_NAME = "ateacrawldata"; 
     49    static final String PROPS_FILENAME = "config.properties"; 
     50    public static final String DB_NAME = "anupama"; //"ateacrawldata"; 
     51    public static final String WEBPAGES_COLLECTION = "webpages"; 
     52    public static final String WEBSITES_COLLECTION = "websites";     
    5353     
     54    private String HOST = "localhost"; 
     55    private int PORT = 27017; // mongodb port 
    5456    private String USERNAME; 
    55     private String PASSWORD; 
    56  
     57    private String PASSWORD;     
    5758 
    5859    private MongoClient mongo = null; 
    5960    private MongoDatabase database = null; 
     61     
    6062     
    6163    public MongoDBAccess() throws Exception { 
     
    118120    logger.info("Credentials: "+ credential); 
    119121    } 
     122 
    120123     
     124    public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite, 
     125           int numPages, int numPagesInMRI, int numPagesContainingMRI, 
     126                  /* TODO: String geoLocationCountryCode, boolean miURL */ 
     127           String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl) 
     128    { 
     129    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
     130    Document document = new Document("id", SITE_COUNTER) 
     131        .append("siteFolderName", siteID)  
     132        .append("domain", domainOfSite)  
     133        .append("totalPages", numPages)  
     134        .append("numPagesInMRI", numPagesInMRI) 
     135        .append("numPagesContainingMRI", numPagesContainingMRI) 
     136        .append("siteCrawledTimestamp", siteCrawledTimestamp) 
     137        .append("siteCrawlUnfinished", siteCrawlUnfinished) 
     138        .append("redoCrawl", redoCrawl); 
     139    collection.insertOne(document);  
     140    System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION);   
     141    } 
    121142 
    122     /* 
    123     public void insertDocument() { 
    124     MongoCollection<Document> collection = this.database.getCollection("sampleCollection"); 
     143     
     144    public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/ 
     145                  String url, String charEncoding, String modTime, String fetchTime, 
     146                  boolean isMRI, int totalSentences, int numSentencesInMRI, 
     147                  ArrayList<SentenceInfo> singleSentences, 
     148                  ArrayList<SentenceInfo> overlappingSentences) 
     149    { 
     150    // load the webpages db 'table' 
     151    // in mongodb, the equivalent of db tables are called 'collections' 
     152    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
     153 
     154    Document document = new Document("id", WEBPAGE_COUNTER) 
     155        .append("siteid", site_id)  
     156        .append("url", url)  
     157        .append("charEncoding", charEncoding)  
     158        .append("modTime", modTime) 
     159        .append("fetchTime", fetchTime) 
     160        .append("isMRI", isMRI) 
     161        .append("totalSentences", totalSentences) 
     162        .append("numSentencesInMRI", numSentencesInMRI); 
     163 
     164    document.put("singleSentences", singleSentences); 
     165    document.put("overlappingSentences", overlappingSentences); 
     166     
     167    collection.insertOne(document); 
     168    System.out.println("webpage info inserted successfully into " + WEBPAGES_COLLECTION);   
    125169    } 
    126     */ 
     170     
    127171 
    128172    // TODO: 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java

    r33623 r33633  
    3838 * TO COMPILE: 
    3939 *    maori-lang-detection/src$ 
    40  *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java 
     40 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java 
    4141 * 
    4242 * TO RUN: 
    4343 *    maori-lang-detection/src$ 
    44  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small 
     44 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small 
    4545 * 
    4646 * or: 
    47  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1 
     47 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1 
    4848 * 
    4949*/ 
    50 public class NutchTextDumpProcessor { 
    51     static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 
     50public class NutchTextDumpToCSV { 
     51    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName()); 
    5252 
    5353    static boolean DEBUG_MODE = true; 
    5454 
    5555    /** Counter for number of sites. 
    56      * Should be equal to number of times NutchTextDumpProcessor constructor 
     56     * Should be equal to number of times NutchTextDumpToCSV constructor 
    5757     * is called: once per site. 
    5858     */ 
     
    6666    public final boolean siteCrawlUnfinished; 
    6767    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 
    68      
     68     
    6969    private String domainOfSite; 
    7070     
     
    100100    } 
    101101 
    102     /** A NutchTextDumpProcessor processes the dump.txt for one site */ 
    103     public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 
     102    /** A NutchTextDumpToCSV processes the dump.txt for one site */ 
     103    public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 
    104104                  MaoriTextDetector maoriTxtDetector, String siteID, 
    105105                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 
    106106    throws IOException 
    107107    {    
    108     // increment static counter of sites processed by a NutchTextDumpProcessor instance 
     108    // increment static counter of sites processed by a NutchTextDumpToCSV instance 
    109109    SITE_COUNTER++; 
    110110     
     
    190190    prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter); 
    191191    } 
    192      
     192 
     193    /* UNUSED */ 
    193194    /** pageID: id into pages array */ 
     195    /* 
    194196    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {     
    195197 
     
    228230    return pagesContainingMRI.size(); 
    229231    } 
     232    */ 
    230233     
    231234    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException { 
     
    257260        // remaining elements are the actual sentences that were detected as being Māori 
    258261        int totalSentences = Integer.parseInt(mriSentences.get(0)); 
    259         int numSentencesInMRI = mriSentences.size() - 1;         
     262        int numSentencesInMRI = mriSentences.size() - 1; 
    260263 
    261264        // Add page to list of MRI pages if the page's body text overall was detected 
     
    367370    public static void printUsage() { 
    368371    System.err.println("Run this program as:"); 
    369     System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 
     372    System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>"); 
    370373    } 
    371374     
     
    382385    } 
    383386 
    384     NutchTextDumpProcessor.DEBUG_MODE = false; 
     387    NutchTextDumpToCSV.DEBUG_MODE = false; 
    385388     
    386389    File websitesCSVFile = new File(sitesDir, "websites.csv"); 
     
    426429            long lastModified = siteDir.lastModified(); 
    427430            logger.debug("Found siteID: " + siteID);             
    428             NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor( 
     431            NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV( 
    429432                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, 
    430433                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
     
    442445         
    443446    } catch(Exception e) { 
    444         // can get an exception when instantiating NutchTextDumpProcessor instance 
     447        // can get an exception when instantiating NutchTextDumpToCSV instance 
    445448        // or with CSV file 
    446449        logger.error(e.getMessage(), e); 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33587 r33633  
    142142    } 
    143143 
     144    /** inner class */ 
     145    public class SentenceInfo { 
     146    public final double confidenceLevel; 
     147    /** 3 letter lang code */ 
     148    public final String langCode; 
     149    public final String sentence; 
     150 
     151    public SentenceInfo(double confidence, String langCode, String sentence) { 
     152        confidenceLevel = confidence; 
     153        this.langCode = langCode; 
     154        this.sentence = sentence; 
     155    } 
     156    } 
     157 
     158    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text 
     159     * into sentences? What if the text is in any other language or a mix of languages? 
     160     * Doesn't this assume that all languages split sentences alike? */ 
     161    public String[] getAllSentences(String text) { 
     162         
     163    // This function doesn't work if the sentenceDetector object wasn't set up 
     164    if(sentenceDetector == null) return null; 
     165 
     166    String[] sentences = sentenceDetector.sentDetect(text); 
     167    return sentences; 
     168    } 
     169  
     170    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) { 
     171 
     172    if(sentences == null) { 
     173        return null;         
     174    } 
     175 
     176    ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>(); 
     177    for(int i = 0; i < sentences.length; i++) { 
     178        String sentence = sentences[i];      
     179         
     180        //System.err.println(sentence); 
     181 
     182        Language bestLanguage = myCategorizer.predictLanguage(sentence); 
     183        double confidence = bestLanguage.getConfidence(); 
     184 
     185        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
     186    } 
     187 
     188    return sentencesList; 
     189    } 
     190 
     191    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) { 
     192     
     193    if(sentences == null) { 
     194        return null;         
     195    } 
     196 
     197    ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>(); 
     198    for(int i = 1; i < sentences.length; i++) { 
     199        // glue every two adjacent sentences together 
     200        String sentence = sentences[i-1]; 
     201         
     202        String separator = ". "; 
     203        // if the sentence already ends with a terminating punctuation character, 
     204        // then separator is just a space 
     205        sentence = sentence.trim(); 
     206        if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) { 
     207        separator = " "; 
     208        } 
     209        sentence = sentence + separator + sentences[i];  
     210         
     211        //System.err.println(sentence); 
     212 
     213        Language bestLanguage = myCategorizer.predictLanguage(sentence); 
     214        double confidence = bestLanguage.getConfidence(); 
     215 
     216        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
     217    } 
     218 
     219    return sentencesList; 
     220    } 
     221     
    144222    /**  
    145223     * In this class' constructor, need to have set up the Sentence Detection Model 
     
    269347        if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence 
    270348         
    271          
    272349        Language bestLanguage = myCategorizer.predictLanguage(text.toString()); 
    273350        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines