Changeset 33600

Show
Ignore:
Timestamp:
23.10.2019 23:05:38 (3 weeks ago)
Author:
ak19
Message:

Work in progress of writing out CSV files. In future, may write the same info to MySQL DB instead. This commit only does the first of 3 tables, the websites csv file.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

    r33587 r33600  
    4040    return str.toString(); 
    4141    } 
     42 
     43    /** for converting to csv */ 
     44    public String[] toCSV() { 
     45    String[] csvRecord = { Integer.toString(pageID), 
     46                   siteID, // foreign key 
     47                   URL, 
     48                   Boolean.toString(isMRI), 
     49                   Integer.toString(numSentences), 
     50                   Integer.toString(numSentencesInMRI) 
     51    }; 
     52 
     53    return csvRecord; 
     54    } 
    4255} 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33587 r33600  
    33import java.io.*; 
    44import java.lang.ArrayIndexOutOfBoundsException; 
     5import java.time.LocalDateTime; 
    56import java.util.ArrayList; 
    67import java.util.Arrays; 
    78 
     9import org.apache.commons.csv.*; 
    810import org.apache.log4j.Logger; 
     11 
    912 
    1013/** 
     
    3942 * TO RUN: 
    4043 *    maori-lang-detection/src$ 
    41  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled 
     44 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small 
    4245 * 
    4346 * or: 
    44  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1 
     47 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1 
    4548 * 
    4649*/ 
     
    4952 
    5053    static boolean DEBUG_MODE = true; 
     54 
     55    /** Counter for number of sites. 
     56     * Should be equal to number of times NutchTextDumpProcessor constructor 
     57     * is called: once per site. 
     58     */ 
     59    static private int SITE_COUNTER = 0; 
    5160     
    5261    private final MaoriTextDetector maoriTxtDetector; 
    5362     
    54     public final String siteID; // is this necessary? 
     63    public final String siteID; 
     64    public final boolean siteCrawlUnfinished; 
     65    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 
     66     
    5567    private String domainOfSite; 
    5668     
     
    6072    /** list of pages in this site which were detected as being in MRI */ 
    6173    private ArrayList<MRIWebPageStats> pagesInMRI; 
    62     /** list of pages in this site which were NOT detected as being in MRI but nevertheless 
     74    /**  
     75     * list of pages in this site which were NOT detected as being in MRI but nevertheless 
    6376     * contain one or more sentences in MRI  
    6477     */ 
     
    8497    } 
    8598    } 
    86      
    87     public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) { 
     99 
     100    /** A NutchTextDumpProcessor processes the dump.txt for one site */ 
     101    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, 
     102                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 
     103    {    
     104    // increment static counter of sites processed by a NutchTextDumpProcessor instance 
     105    SITE_COUNTER++; 
     106     
    88107    // siteID is of the form %05d (e.g. 00020) and is just the name of a site folder 
    89108    this.siteID = siteID; 
     109    this.siteCrawlUnfinished = siteCrawlUnfinished; 
     110    this.siteCrawledTimestamp = lastModified; 
     111     
    90112    this.maoriTxtDetector = maoriTxtDetector; 
    91      
     113         
    92114    pages = new ArrayList<TextDumpPage>(); 
    93115 
     
    195217    public int totalNumPages() { 
    196218    return pages.size(); 
    197     }  
    198  
     219    } 
     220    public int getNumPagesInMRI() { 
     221    return pagesInMRI.size(); 
     222    }     
     223    public int getNumPagesContainingMRI() { 
     224    return pagesContainingMRI.size(); 
     225    } 
     226     
    199227    private void prepareSiteStats() { 
    200228    pagesInMRI = new ArrayList<MRIWebPageStats>(); 
     
    217245 
    218246        // Even if the entire page is not found to be overall in Māori, 
    219         // let's sitll inspect the sentences of the page and count how many (if any) 
     247        // let's still inspect the sentences of the page and count how many (if any) 
    220248        // are in te reo. 
    221249        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text); 
     
    242270        }        
    243271        } 
     272 
     273         
     274         
     275         
    244276    } 
    245277    } 
     
    276308 
    277309     
     310    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException { 
     311 
     312    // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds 
     313    // LocalDateTime date = 
     314    //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault()); 
     315    // String crawlTimestamp = 
     316    //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss")); 
     317 
     318    boolean redoCrawl = false; 
     319    int numPagesInMRI = pagesInMRI.size(); 
     320    int numPagesContainingMRI = pagesContainingMRI.size(); 
     321         
     322    if(this.siteCrawlUnfinished) { 
     323        // arbitrary threshold, but we need some indication that the MRI content is more than a one-off occurrence in the website 
     324        if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {  
     325        redoCrawl = true; 
     326        } 
     327    } 
     328     
     329    // websites.csv CSV file row: 
     330    // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numOtherPagesContainingMRI, nutchCrawlTimestamp, crawlUnfinished, redoCrawl 
     331    websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite, 
     332                       pages.size(), numPagesInMRI, numPagesContainingMRI, 
     333                   this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl); 
     334    } 
     335 
     336    /* 
     337    public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences) 
     338    throws IOException 
     339    { 
     340    int totalSentences 
     341     
     342    for(int i = 0; i < ) 
     343        printer.printRecord(); 
     344         
     345    } catch (IOException ex) { 
     346        ex.printStackTrace(); 
     347    } 
     348    } 
     349    */ 
     350     
     351    /* 
     352    public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) { 
     353    // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 
     354 
     355    //CSVFormat csvFormat = CSVFormat.DEFAULT. 
     356    //    withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI"); 
     357     
     358    try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) { 
     359        // header 
     360        //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI"); 
     361        // skip first one 
     362         
     363        for() 
     364        printer.printRecord(); 
     365         
     366    } catch (IOException ex) { 
     367        ex.printStackTrace(); 
     368    } 
     369    } 
     370    */ 
     371     
    278372    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- // 
    279373    public static void info(String msg) { 
     
    301395    public static void printUsage() { 
    302396    info("Run this program as:"); 
    303     info("\tNutchTextDumpProcessor <path to 'sites' folder>");   
     397    info("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 
    304398    } 
    305399     
     
    318412    NutchTextDumpProcessor.DEBUG_MODE = false; 
    319413     
    320     try { 
     414    File websitesCSVFile = new File(sitesDir, "websites.csv"); 
     415     
     416    try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) { 
     417 
     418        // print out the column headers for the websites csv file 
     419        websitesCSVPrinter.printRecord("ID", "siteID", "domainURL", 
     420                   "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 
     421                   "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 
     422 
     423         
    321424        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 
    322425        File[] sites = sitesDir.listFiles(); 
     
    336439             
    337440            else { 
     441            File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");          
     442             
    338443            String siteID = siteDir.getName(); 
     444            long lastModified = siteDir.lastModified(); 
    339445            debug("Found siteID: " + siteID);            
    340             NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 
     446            NutchTextDumpProcessor nutchTxtDump 
     447                = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
    341448            // now it's parsed all the web pages in the site's text dump 
    342449 
     
    345452             
    346453            nutchTxtDump.printSiteStats(); 
     454            nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter); 
    347455            } 
    348456        } 
     
    352460    } catch(Exception e) { 
    353461        // can get an exception when instantiating CCWETProcessor instance 
     462        // or with CSV file 
    354463        error(e.getMessage(), e); 
    355464    }