Changeset 33600


Timestamp: 2019-10-23T23:05:38+13:00 (5 years ago)
Author: ak19
Message:

Work in progress on writing out CSV files. In future, the same information may be written to a MySQL DB instead. This commit only handles the first of the 3 tables, the websites CSV file (see the sketch below the file list).

Location: gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files: 2 edited
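The commit message above describes writing the websites table out with Apache Commons CSV. A minimal, self-contained sketch of that pattern is shown below, assuming commons-csv is on the classpath; the output file name and the sample data row are illustrative assumptions, not values from the commit (the header columns match those printed in NutchTextDumpProcessor.main()).

    import java.io.FileWriter;
    import java.io.IOException;
    import org.apache.commons.csv.CSVFormat;
    import org.apache.commons.csv.CSVPrinter;

    public class WebsitesCSVSketch {
        public static void main(String[] args) throws IOException {
            // try-with-resources flushes and closes the underlying writer
            try (CSVPrinter printer = new CSVPrinter(new FileWriter("websites.csv"), CSVFormat.DEFAULT)) {
                // header row, matching the columns used in NutchTextDumpProcessor.main()
                printer.printRecord("ID", "siteID", "domainURL",
                        "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                        "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
                // one illustrative data row (values are made up, not from a real crawl)
                printer.printRecord(1, "00001", "http://example.org",
                        25, 3, 2, 1571820000000L, false, false);
            }
        }
    }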

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

    r33587 r33600  
    4040    return str.toString();
    4141    }
     42
     43    /** for converting to csv */
     44    public String[] toCSV() {
     45    String[] csvRecord = { Integer.toString(pageID),
     46                   siteID, // foreign key
     47                   URL,
     48                   Boolean.toString(isMRI),
     49                   Integer.toString(numSentences),
     50                   Integer.toString(numSentencesInMRI)
     51    };
     52
     53    return csvRecord;
     54    }
    4255}
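The new MRIWebPageStats.toCSV() above returns one page's fields as a String[]. Nothing in this commit consumes it yet (the webpages table code in NutchTextDumpProcessor is still commented out), but a hedged sketch of how it could be fed to a CSVPrinter might look as follows; the file name and the surrounding call site are assumptions, while the header names are taken from the commented-out code further down.

    // Assumes commons-csv on the classpath and an existing MRIWebPageStats instance `page`.
    try (CSVPrinter printer = new CSVPrinter(new FileWriter("webpages.csv"), CSVFormat.DEFAULT)) {
        // header row as in the commented-out sitePagesToCSV code
        printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
        // printRecord(Object...) treats the String[] as varargs, one column per element
        printer.printRecord((Object[]) page.toCSV());
    }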
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33587 r33600  
    33import java.io.*;
    44import java.lang.ArrayIndexOutOfBoundsException;
     5import java.time.LocalDateTime;
    56import java.util.ArrayList;
    67import java.util.Arrays;
    78
     9import org.apache.commons.csv.*;
    810import org.apache.log4j.Logger;
     11
    912
    1013/**
     
    3942 * TO RUN:
    4043 *    maori-lang-detection/src$
    41  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
     44 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
    4245 *
    4346 * or:
    44  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1
     47 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
    4548 *
    4649*/
     
    4952
    5053    static boolean DEBUG_MODE = true;
     54
     55    /** Counter for number of sites.
     56     * Should be equal to number of times NutchTextDumpProcessor constructor
     57     * is called: once per site.
     58     */
     59    static private int SITE_COUNTER = 0;
    5160   
    5261    private final MaoriTextDetector maoriTxtDetector;
    5362   
    54     public final String siteID; // is this necessary?
     63    public final String siteID;
     64    public final boolean siteCrawlUnfinished;
     65    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */
     66   
    5567    private String domainOfSite;
    5668   
     
    6072    /** list of pages in this site which were detected as being in MRI */
    6173    private ArrayList<MRIWebPageStats> pagesInMRI;
    62     /** list of pages in this site which were NOT detected as being in MRI but nevertheless
     74    /**
     75     * list of pages in this site which were NOT detected as being in MRI but nevertheless
    6376     * contain one or more sentences in MRI
    6477     */
     
    8497    }
    8598    }
    86    
    87     public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
     99
     100    /** A NutchTextDumpProcessor processes the dump.txt for one site */
     101    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID,
     102                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
     103    {   
     104    // increment static counter of sites processed by a NutchTextDumpProcessor instance
     105    SITE_COUNTER++;
     106   
    88107    // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
    89108    this.siteID = siteID;
     109    this.siteCrawlUnfinished = siteCrawlUnfinished;
     110    this.siteCrawledTimestamp = lastModified;
     111   
    90112    this.maoriTxtDetector = maoriTxtDetector;
    91    
     113       
    92114    pages = new ArrayList<TextDumpPage>();
    93115
     
    195217    public int totalNumPages() {
    196218    return pages.size();
    197     }
    198 
     219    }
     220    public int getNumPagesInMRI() {
     221    return pagesInMRI.size();
     222    }   
     223    public int getNumPagesContainingMRI() {
     224    return pagesContainingMRI.size();
     225    }
     226   
    199227    private void prepareSiteStats() {
    200228    pagesInMRI = new ArrayList<MRIWebPageStats>();
     
    217245
    218246        // Even if the entire page is not found to be overall in Māori,
    219         // let's sitll inspect the sentences of the page and count how many (if any)
     247        // let's still inspect the sentences of the page and count how many (if any)
    220248        // are in te reo.
    221249        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
     
    242270        }       
    243271        }
     272
     273       
     274       
     275       
    244276    }
    245277    }
     
    276308
    277309   
     310    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {
     311
     312    // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
     313    // LocalDateTime date =
     314    //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
     315    // String crawlTimestamp =
     316    //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));
     317
     318    boolean redoCrawl = false;
     319    int numPagesInMRI = pagesInMRI.size();
     320    int numPagesContainingMRI = pagesContainingMRI.size();
     321       
     322    if(this.siteCrawlUnfinished) {
     323        // arbitrary decision, but need some indication that the MRI content was not close to one-off in the website
     324        if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
     325        redoCrawl = true;
     326        }
     327    }
     328   
     329    // site.csv CSV file row:
     330    // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI, crawlUnfinished, redoCrawl
     331    websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
     332                       pages.size(), numPagesInMRI, numPagesContainingMRI,
     333                   this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
     334    }
     335
     336    /*
     337    public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences)
     338    throws IOException
     339    {
     340    int totalSentences
     341   
     342    for(int i = 0; i < )
     343        printer.printRecord();
     344       
     345    } catch (IOException ex) {
     346        ex.printStackTrace();
     347    }
     348    }
     349    */
     350   
     351    /*
     352    public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) {
     353    // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
     354
     355    //CSVFormat csvFormat = CSVFormat.DEFAULT.
     356    //    withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
     357   
     358    try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) {
     359        // header
     360        //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
     361        // skip first one
     362       
     363        for()
     364        printer.printRecord();
     365       
     366    } catch (IOException ex) {
     367        ex.printStackTrace();
     368    }
     369    }
     370    */
     371   
    278372    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
    279373    public static void info(String msg) {
     
    301395    public static void printUsage() {
    302396    info("Run this program as:");
    303     info("\tNutchTextDumpProcessor <path to 'sites' folder>"); 
     397    info("\tNutchTextDumpProcessor <path to 'crawled' folder>");
    304398    }
    305399   
     
    318412    NutchTextDumpProcessor.DEBUG_MODE = false;
    319413   
    320     try {
     414    File websitesCSVFile = new File(sitesDir, "websites.csv");
     415   
     416    try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) {
     417
     418        // print out the column headers for the websites csv file
     419        websitesCSVPrinter.printRecord("ID", "siteID", "domainURL",
     420                   "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
     421                   "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
     422
     423       
    321424        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
    322425        File[] sites = sitesDir.listFiles();
     
    336439           
    337440            else {
     441            File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");         
     442           
    338443            String siteID = siteDir.getName();
     444            long lastModified = siteDir.lastModified();
    339445            debug("Found siteID: " + siteID);           
    340             NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
     446            NutchTextDumpProcessor nutchTxtDump
     447                = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
    341448            // now it's parsed all the web pages in the site's text dump
    342449
     
    345452           
    346453            nutchTxtDump.printSiteStats();
     454            nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
    347455            }
    348456        }
     
    352460    } catch(Exception e) {
    353461        // can get an exception when instantiating CCWETProcessor instance
     462        // or with CSV file
    354463        error(e.getMessage(), e);
    355464    }
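The commented-out block in writeSiteRecordToCSV() (with the StackOverflow reference) converts the stored epoch-millisecond crawl timestamp into a human-readable string using java.time. A small self-contained sketch of that conversion, with an illustrative timestamp value rather than a real crawl time:

    import java.time.Instant;
    import java.time.LocalDateTime;
    import java.time.ZoneId;
    import java.time.format.DateTimeFormatter;

    public class CrawlTimestampSketch {
        public static void main(String[] args) {
            long siteCrawledTimestamp = 1571820000000L; // illustrative epoch milliseconds
            LocalDateTime date = LocalDateTime.ofInstant(
                    Instant.ofEpochMilli(siteCrawledTimestamp), ZoneId.systemDefault());
            String crawlTimestamp = date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))
                    + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));
            System.out.println(crawlTimestamp); // prints the date and time in the system time zone
        }
    }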