Changeset 33657 for other-projects


Ignore:
Timestamp:
2019-11-12T21:33:57+13:00 (4 years ago)
Author:
ak19
Message:

Some fixes after brief testing against 1/3 of the crawl. Restarted processing of the crawledNode2 set of crawls.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33656 r33657  
    171171        // process final webpage record:
    172172        //debugPageDump(pageDump);
    173        
    174         TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
    175         pages.add(page);
    176         pageDump = null;
    177 
    178         // for every site, we just need to work out if any of its pages
    179         // contains /mi(/) in its URL
    180         String url = page.getPageURL();
    181         if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
    182         this.urlContainsLangCodeInPath = true;
     173
     174        if(pageDump == null) {
     175        logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
     176        } else {
     177        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
     178        pages.add(page);
     179        pageDump = null;
     180
     181        // for every site, we just need to work out if any of its pages
     182        // contains /mi(/) in its URL
     183        String url = page.getPageURL();
     184        if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
     185            this.urlContainsLangCodeInPath = true;
     186        }
    183187        }
    184188       
     
    306310    File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
    307311    try {
    308         this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
    309     } catch(Exception e) {
    310         e.printStackTrace();
     312        if(this.domainOfSite.equals("UNKNOWN")) {
     313        this.geoLocationCountryCode = "UNKNOWN";
     314        } else {
     315        this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
     316        }
     317    } catch(Exception e) {     
     318        logger.error("*** For SiteID " + siteID + ", got exception: "  + e.getMessage(), e);
    311319        this.geoLocationCountryCode = null;
    312320    }
Note: See TracChangeset for help on using the changeset viewer.