Ignore:
Timestamp:
2019-12-13T18:40:46+13:00 (4 years ago)
Author:
ak19
Message:
  1. NutchTextDumpToMongoDB: added an extra field, numPagesContainingMRI, to each document in the Websites MongoDB collection. 2. Bugfix to yesterday's commit: a substring() call was off by one.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33800 r33801  
    7878    private String domainOfSite;
    7979    private int numPagesInMRI = 0;
     80    private int numPagesContainingMRI = 0;
    8081   
    8182    /** keep a list to store the text of each page */
     
    8687    /** Number of language and confidence results to return for storing in MongoDB
    8788     * MongoDB runs out of space if storing too many, as we store this info per sentence
    88      * and a long text document becomes a very large MongoDB document presumable*/
     89     * and a long text document becomes a very large MongoDB document presumably */
    8990    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
    9091
     
    262263        webpage.setMRISentenceCount(numSentencesInMRI);
    263264        webpage.setContainsMRI((numSentencesInMRI > 0));
    264        
     265        if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
     266            // Not sure if we can trust that a single sentence detected as Maori on a page is really Maori
     267            // But if at least 5 sentences are detected as Maori, is the page more likely to really be in MRI?
     268            numPagesContainingMRI++;
     269        }
     270           
    265271        //mongodbAccess.insertWebpageInfo(webpage);
    266272        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
     
    298304    } catch(Exception e) {     
    299305        logger.error("*** For SiteID " + siteID + ", got exception: "  + e.getMessage(), e);
    300         this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null
     306
     307        //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
     308        //this.geoLocationCountryCode = "NZ";
     309        //}
     310
     311        // Help along identification of domain's country by construing TLDs if 2 letters after last period mark
     312        int periodIndex = domainOfSite.length()-3;
     313        // .com|org etc extensions that have 3 chars after the period mark will remain unknown
     314        // 2 letter extensions will be considered TLD
     315        if(periodIndex >=0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
     316        // has a 2 letter TLD. Make it uppercase to match return value of Utility.getCountryCodeOfDomain() above
     317        String TLD = domainOfSite.substring(periodIndex+1);
     318        this.geoLocationCountryCode = TLD.toUpperCase();
     319        } else {
     320        this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null
     321        }
    301322    }
    302323
     
    304325
    305326    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
    306           totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
     327          totalPages, this.countOfWebPagesWithBodyText,
     328          this.numPagesInMRI, this.numPagesContainingMRI,
    307329          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
    308330          this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
Note: See TracChangeset for help on using the changeset viewer.