Changeset 33801


Ignore:
Timestamp:
2019-12-13T18:40:46+13:00 (4 years ago)
Author:
ak19
Message:
  1. NutchTextDumpToMongoDB Added an extra field to each document in Websites mongodb collection: numPagesContainingMRI. 2. Bugfix to yesterday's commit: performing a substring() was off by one.
Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33800 r33801  
    7878    private String domainOfSite;
    7979    private int numPagesInMRI = 0;
     80    private int numPagesContainingMRI = 0;
    8081   
    8182    /** keep a list to store the text of each page */
     
    8687    /** Number of language and confidence results to return for storing in MongoDB
    8788     * MongoDB runs out of space if storing too many, as we store this info per sentence
    88      * and a long text document becomes a very large MongoDB document presumable*/
     89     * and a long text document becomes a very large MongoDB document presumably */
    8990    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
    9091
     
    262263        webpage.setMRISentenceCount(numSentencesInMRI);
    263264        webpage.setContainsMRI((numSentencesInMRI > 0));
    264        
     265        if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
     266            // Not sure if we can trust that a single sentence detected as Maori on a page is really Maori
     267            // But if at least 5 sentences are detected as Maori, is the page more likely to really be in MRI?
     268            numPagesContainingMRI++;
     269        }
     270           
    265271        //mongodbAccess.insertWebpageInfo(webpage);
    266272        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
     
    298304    } catch(Exception e) {     
    299305        logger.error("*** For SiteID " + siteID + ", got exception: "  + e.getMessage(), e);
    300         this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null
     306
     307        //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
     308        //this.geoLocationCountryCode = "NZ";
     309        //}
     310
     311        // Help identify the domain's country by treating the last 2 letters as a TLD if they directly follow the last period mark
     312        int periodIndex = domainOfSite.length()-3;
     313        // .com|org etc extensions that have 3 chars after the period mark will remain unknown
     314        // 2 letter extensions will be considered TLD
     315        if(periodIndex >=0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
     316        // has a 2 letter TLD. Make it uppercase to match return value of Utility.getCountryCodeOfDomain() above
     317        String TLD = domainOfSite.substring(periodIndex+1);
     318        this.geoLocationCountryCode = TLD.toUpperCase();
     319        } else {
     320        this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null
     321        }
    301322    }
    302323
     
    304325
    305326    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
    306           totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
     327          totalPages, this.countOfWebPagesWithBodyText,
     328          this.numPagesInMRI, this.numPagesContainingMRI,
    307329          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
    308330          this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33698 r33801  
    1212    public final int totalPages;
    1313    public final int countOfWebPagesWithBodyText;
     14   
    1415    public final int numPagesInMRI;
     16    public final int numPagesContainingMRI;
    1517   
    1618    public final long siteCrawledTimestamp;
     
    2224   
    2325    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite,
    24                int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI,
     26               int totalPages, int countOfWebPagesWithBodyText,
     27               int numPagesInMRI, int numPagesContainingMRI,
    2528               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl,
    2629               String geoLocationCountryCode, boolean urlContainsLangCodeInPath)
     
    3235    this.totalPages = totalPages;
    3336    this.countOfWebPagesWithBodyText = countOfWebPagesWithBodyText;
     37   
    3438    this.numPagesInMRI = numPagesInMRI;
     39    this.numPagesContainingMRI = numPagesContainingMRI;
    3540   
    3641    this.siteCrawledTimestamp = siteCrawledTimestamp;
Note: See TracChangeset for help on using the changeset viewer.