1 | package org.greenstone.atea;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import java.util.*;
|
---|
5 | import org.apache.log4j.Logger;
|
---|
6 |
|
---|
7 | /**
|
---|
8 | * To compile, including the jars in lib/ for compiling.
|
---|
9 | * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/AllDomainCount.java
|
---|
10 | *
|
---|
11 | * To run, passing the log4j and other properties files in conf/ folder:
|
---|
12 | * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount
|
---|
13 | *
|
---|
14 | * OR pass one of the files keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt:
|
---|
15 | * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount full/path/to/keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt
|
---|
16 | *
|
---|
17 | * If no argument provided, then files keepURLs.txt + greyListed.txt + discardURLs.txt,
|
---|
18 | * representing the union of all URLs from CommonCrawl,
|
---|
19 | * are used as input and totals displayed are over all three files.
|
---|
20 | */
|
---|
21 | public class AllDomainCount {
|
---|
22 | private static Logger logger = Logger.getLogger(org.greenstone.atea.AllDomainCount.class.getName());
|
---|
23 |
|
---|
24 | public static String containingFolder = "../tmp/to_crawl.THE_VERSION_USED/";
|
---|
25 |
|
---|
26 | // TreeSet: by default, "the elements are ordered using their natural ordering"
|
---|
27 | private Set<String> domains = new TreeSet<String>();
|
---|
28 | private Set<String> uniqueDomains = new TreeSet<String>(); // stripped of protocol and www
|
---|
29 | private Set<String> URLs = new TreeSet<String>();
|
---|
30 | private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www
|
---|
31 |
|
---|
32 | // just for debugging
|
---|
33 | private Map<String, String> domainToInfile = new HashMap<String,String>();
|
---|
34 |
|
---|
35 | private int lineCount = 0;
|
---|
36 | private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};;
|
---|
37 |
|
---|
38 | public AllDomainCount() {
|
---|
39 | for(String filename : filenames) {
|
---|
40 | File infile = new File(containingFolder, filename);
|
---|
41 |
|
---|
42 | addStatsForFile(infile);
|
---|
43 | }
|
---|
44 | }
|
---|
45 |
|
---|
46 | public AllDomainCount(File infile) {
|
---|
47 | addStatsForFile(infile);
|
---|
48 | }
|
---|
49 |
|
---|
50 | public int getLineCount() { return lineCount; }
|
---|
51 | public int getDomainsCount() { return domains.size(); }
|
---|
52 | public int getUniqueDomainsCount() { return uniqueDomains.size(); }
|
---|
53 | public int getURLsCount() { return URLs.size(); }
|
---|
54 | public int getUniqueURLsCount() { return uniqueURLs.size(); }
|
---|
55 |
|
---|
56 | public void addStatsForFile(File infile) {
|
---|
57 | try (
|
---|
58 | BufferedReader reader = new BufferedReader(new FileReader(infile));
|
---|
59 | ) {
|
---|
60 |
|
---|
61 | String url = null;
|
---|
62 |
|
---|
63 | while((url = reader.readLine()) != null) {
|
---|
64 | lineCount++;
|
---|
65 | URLs.add(url);
|
---|
66 |
|
---|
67 | // don't care about real uniqueness of domains - Set just ensures that each
|
---|
68 | // domain is listed once. Keeps http(s) and www distinct
|
---|
69 | String domainWithProtocol = Utility.getDomainForURL(url, true);
|
---|
70 | domains.add(domainWithProtocol);
|
---|
71 |
|
---|
72 | // START DEBUGGING - to inspect why domains tend to be repeated
|
---|
73 | // between (a) discardURLs and greylisted/keepURLs and
|
---|
74 | // (b) greylisted and keepURLs.
|
---|
75 | // Because (a) some pages on greylisted or even accepted sites contain
|
---|
76 | // less than minimum content and end up in discarded urls list.
|
---|
77 | // And (b) some domains contain URLs with /product-... suffix
|
---|
78 | // and are greylisted but the same domain can contain URLs without a /product
|
---|
79 | // pattern and end up as part of keepURLs.
|
---|
80 | if(!domainToInfile.containsKey(domainWithProtocol)) {
|
---|
81 | domainToInfile.put(domainWithProtocol, infile.getPath());
|
---|
82 | } else {
|
---|
83 | String oldInfile = domainToInfile.get(domainWithProtocol);
|
---|
84 | if(!oldInfile.equals(infile.getPath())) {
|
---|
85 | System.err.println("\t*** " + domainWithProtocol + " is repeated between old " + oldInfile + " and new " + infile.getPath());
|
---|
86 | }
|
---|
87 | }
|
---|
88 | // END DEBUGGING
|
---|
89 |
|
---|
90 | // unique domains - so stripped of http(s) and www
|
---|
91 | String basicDomain = Utility.stripProtocolAndWWWFromURL(domainWithProtocol);
|
---|
92 | uniqueDomains.add(basicDomain);
|
---|
93 |
|
---|
94 | //System.err.println("URL: " + url);
|
---|
95 |
|
---|
96 | String basicURL = Utility.stripProtocolAndWWWFromURL(url);
|
---|
97 | uniqueURLs.add(basicURL);
|
---|
98 | }
|
---|
99 |
|
---|
100 | } catch(Exception e) {
|
---|
101 | logger.error("Got exception", e);
|
---|
102 | }
|
---|
103 | }
|
---|
104 |
|
---|
105 | public static void printUsage() {
|
---|
106 | System.err.println("Run as:");
|
---|
107 | System.err.println("\tUniqueDomainCount [file listing urls]");
|
---|
108 | }
|
---|
109 |
|
---|
110 | public static void main(String[] args) {
|
---|
111 |
|
---|
112 |
|
---|
113 | if(args.length != 0 && args.length != 1) {
|
---|
114 | printUsage();
|
---|
115 | System.exit(-1);
|
---|
116 | }
|
---|
117 |
|
---|
118 | AllDomainCount stats = null;
|
---|
119 |
|
---|
120 | if(args.length == 1) {
|
---|
121 | File infile = new File(args[0]);
|
---|
122 | if(!infile.exists()) {
|
---|
123 | logger.error("File " + infile + " does not exist. Terminating.");
|
---|
124 | System.exit(-1);
|
---|
125 | }
|
---|
126 |
|
---|
127 | logger.info("Counting all domains and urls in " + infile.getName());
|
---|
128 | stats = new AllDomainCount(infile);
|
---|
129 | } else {
|
---|
130 | logger.info("Counting all domains and urls in keepURLs.txt + discardURLs.txt + greyListed.txt");
|
---|
131 | stats = new AllDomainCount();
|
---|
132 | }
|
---|
133 |
|
---|
134 |
|
---|
135 |
|
---|
136 | //logger.info(String.format("In file %s:", infile));
|
---|
137 | logger.info("\tCount of unique domains: " + stats.getDomainsCount());
|
---|
138 | logger.info("\tCount of unique basic domains (stripped of protocol and www): " + stats.getUniqueDomainsCount());
|
---|
139 | logger.info("\tLine count: " + stats.getLineCount());
|
---|
140 | logger.info("\tActual unique URL count: " + stats.getURLsCount());
|
---|
141 | logger.info("\tUnique basic URL count (stripped of protocol and www): " + stats.getUniqueURLsCount());
|
---|
142 | logger.info("******************************************************");
|
---|
143 | }
|
---|
144 | }
|
---|