package org.greenstone.atea;

import java.io.*;
import java.util.*;

import org.apache.log4j.Logger;

/**
 * To compile, include the jars in lib/ on the classpath:
 *    maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/AllDomainCount.java
 *
 * To run, pass the log4j and other properties files in the conf/ folder:
 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount
 *
 * OR pass one of the files keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt:
 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount full/path/to/keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt
 *
 * If no argument is provided, then the files keepURLs.txt + greyListed.txt + discardURLs.txt,
 * representing the union of all URLs from CommonCrawl, are used as input and the totals
 * displayed are over all three files.
 */
public class AllDomainCount {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.AllDomainCount.class.getName());

    public static String containingFolder = "../tmp/to_crawl.THE_VERSION_USED/";

    // TreeSet: by default, "the elements are ordered using their natural ordering"
    private Set<String> domains = new TreeSet<String>();
    private Set<String> uniqueDomains = new TreeSet<String>(); // stripped of protocol and www
    private Set<String> URLs = new TreeSet<String>();
    private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www

    // just for debugging
    private Map<String, String> domainToInfile = new HashMap<String, String>();

    private int lineCount = 0;

    private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};

    public AllDomainCount() {
        for(String filename : filenames) {
            File infile = new File(containingFolder, filename);
            addStatsForFile(infile);
        }
    }

    public AllDomainCount(File infile) {
        addStatsForFile(infile);
    }

    public int getLineCount() { return lineCount; }
    public int getDomainsCount() { return domains.size(); }
    public int getUniqueDomainsCount() { return uniqueDomains.size(); }
    public int getURLsCount() { return URLs.size(); }
    public int getUniqueURLsCount() { return uniqueURLs.size(); }

    public void addStatsForFile(File infile) {
        try (
             BufferedReader reader = new BufferedReader(new FileReader(infile));
             ) {
            String url = null;
            while((url = reader.readLine()) != null) {
                lineCount++;
                URLs.add(url);

                // We don't care about real uniqueness of domains here - the Set just
                // ensures that each domain is listed once. Keeps http(s) and www distinct.
                String domainWithProtocol = Utility.getDomainForURL(url, true);
                domains.add(domainWithProtocol);
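                // Note: Utility.getDomainForURL() and Utility.stripProtocolAndWWWFromURL()
                // are helpers defined elsewhere in this package. The behaviour assumed by
                // this class (an assumption inferred from usage here, not verified against
                // Utility.java) is roughly:
                //   getDomainForURL("https://www.example.com/page", true) -> "https://www.example.com"
                //   stripProtocolAndWWWFromURL("https://www.example.com") -> "example.com"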
                // START DEBUGGING - to inspect why domains tend to be repeated
                // between (a) discardURLs and greylisted/keepURLs and
                // (b) greylisted and keepURLs.
                // (a) happens because some pages on greylisted or even accepted sites
                // contain less than the minimum content and end up in the discarded urls list.
                // (b) happens because some domains contain URLs with a /product-... suffix
                // and are greylisted, but the same domain can also contain URLs without a
                // /product pattern, which end up as part of keepURLs.
                if(!domainToInfile.containsKey(domainWithProtocol)) {
                    domainToInfile.put(domainWithProtocol, infile.getPath());
                } else {
                    String oldInfile = domainToInfile.get(domainWithProtocol);
                    if(!oldInfile.equals(infile.getPath())) {
                        System.err.println("\t*** " + domainWithProtocol + " is repeated between old "
                                           + oldInfile + " and new " + infile.getPath());
                    }
                }
                // END DEBUGGING

                // unique domains - so stripped of http(s) and www
                String basicDomain = Utility.stripProtocolAndWWWFromURL(domainWithProtocol);
                uniqueDomains.add(basicDomain);

                //System.err.println("URL: " + url);
                String basicURL = Utility.stripProtocolAndWWWFromURL(url);
                uniqueURLs.add(basicURL);
            }
        } catch(Exception e) {
            logger.error("Got exception", e);
        }
    }

    public static void printUsage() {
        System.err.println("Run as:");
        System.err.println("\tAllDomainCount [file listing urls]");
    }

    public static void main(String[] args) {
        if(args.length != 0 && args.length != 1) {
            printUsage();
            System.exit(-1);
        }

        AllDomainCount stats = null;

        if(args.length == 1) {
            File infile = new File(args[0]);
            if(!infile.exists()) {
                logger.error("File " + infile + " does not exist. Terminating.");
                System.exit(-1);
            }
            logger.info("Counting all domains and urls in " + infile.getName());
            stats = new AllDomainCount(infile);
        } else {
            logger.info("Counting all domains and urls in keepURLs.txt + discardURLs.txt + greyListed.txt");
            stats = new AllDomainCount();
        }

        //logger.info(String.format("In file %s:", infile));
        logger.info("\tCount of unique domains (keeping protocol and www distinct): " + stats.getDomainsCount());
        logger.info("\tCount of unique basic domains (stripped of protocol and www): " + stats.getUniqueDomainsCount());
        logger.info("\tLine count: " + stats.getLineCount());
        logger.info("\tActual unique URL count: " + stats.getURLsCount());
        logger.info("\tUnique basic URL count (stripped of protocol and www): " + stats.getUniqueURLsCount());
        logger.info("******************************************************");
    }
}