Changeset 33657

Show
Ignore:
Timestamp:
12.11.2019 21:33:57 (3 weeks ago)
Author:
ak19
Message:

Some fixes after brief testing against one third of the crawl. Restarted processing of the crawledNode2 set of crawls.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33656 r33657  
    171171        // process final webpage record: 
    172172        //debugPageDump(pageDump); 
    173          
    174         TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 
    175         pages.add(page); 
    176         pageDump = null; 
    177  
    178         // for every site, we just need to work out if any of its pages 
    179         // contains /mi(/) in its URL 
    180         String url = page.getPageURL(); 
    181         if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    182         this.urlContainsLangCodeInPath = true; 
     173 
     174        if(pageDump == null) { 
     175        logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site."); 
     176        } else { 
     177        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 
     178        pages.add(page); 
     179        pageDump = null; 
     180 
     181        // for every site, we just need to work out if any of its pages 
     182        // contains /mi(/) in its URL 
     183        String url = page.getPageURL(); 
     184        if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     185            this.urlContainsLangCodeInPath = true; 
     186        } 
    183187        } 
    184188         
     
    306310    File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile()); 
    307311    try { 
    308         this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile); 
    309     } catch(Exception e) { 
    310         e.printStackTrace(); 
     312        if(this.domainOfSite.equals("UNKNOWN")) { 
     313        this.geoLocationCountryCode = "UNKNOWN"; 
     314        } else { 
     315        this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile); 
     316        } 
     317    } catch(Exception e) {       
     318        logger.error("*** For SiteID " + siteID + ", got exception: "  + e.getMessage(), e); 
    311319        this.geoLocationCountryCode = null; 
    312320    }