Changeset 33906 for other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
- Timestamp:
- 2020-02-05T23:36:37+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33811  r33906
75      75        private String geoLocationCountryCode = null; /** 2 letter country code */
76      76        private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */
77      77
78      78        private String domainOfSite;
        79    +   private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
79      80        private int numPagesInMRI = 0;
80      81        private int numPagesContainingMRI = 0;
…       …
202     203       String url = firstPage.getPageURL();
203     204       this.domainOfSite = Utility.getDomainForURL(url, true);
        205   +   this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
204     206       }
205     207       else {
206     208       this.domainOfSite = "UNKNOWN";
        209   +   this.baseSiteDomain = "UNKNOWN";
207     210       }
208     211
…       …
339     342       int totalPages = pages.size();
340     343
341           -   WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
        344   +   WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
        345   +   this.domainOfSite, this.baseSiteDomain,
342     346       totalPages, this.countOfWebPagesWithBodyText,
343     347       this.numPagesInMRI, this.numPagesContainingMRI,
Note: See TracChangeset for help on using the changeset viewer.