1 | package org.greenstone.atea;
|
---|
2 |
|
---|
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import org.apache.log4j.Logger;
6 |
|
---|
/**
 * Counts lines, domains and URLs across one or more files listing URLs (one URL per line).
 *
 * To compile, including the jars in lib/ for compiling:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/AllDomainCount.java
 *
 * To run, passing the log4j and other properties files in the conf/ folder:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount
 *
 * OR pass one of the files keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount full/path/to/keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt
 *
 * If no argument is provided, then the files keepURLs.txt + greyListed.txt + discardURLs.txt,
 * representing the union of all URLs from CommonCrawl,
 * are used as input, and the totals displayed are over all three files.
 */
21 | public class AllDomainCount {
|
---|
22 | private static Logger logger = Logger.getLogger(org.greenstone.atea.AllDomainCount.class.getName());
|
---|
23 |
|
---|
24 | public static String containingFolder = "../tmp/to_crawl.THE_VERSION_USED/";
|
---|
25 |
|
---|
26 | // TreeSet: by default, "the elements are ordered using their natural ordering"
|
---|
27 | private Set<String> domains = new TreeSet<String>();
|
---|
28 | private Set<String> uniqueDomains = new TreeSet<String>(); // stripped of protocol and www
|
---|
29 | private Set<String> URLs = new TreeSet<String>();
|
---|
30 | private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www
|
---|
31 |
|
---|
32 | private int lineCount = 0;
|
---|
33 | private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};;
|
---|
34 |
|
---|
35 | public AllDomainCount() {
|
---|
36 | for(String filename : filenames) {
|
---|
37 | File infile = new File(containingFolder, filename);
|
---|
38 |
|
---|
39 | addStatsForFile(infile);
|
---|
40 | }
|
---|
41 | }
|
---|
42 |
|
---|
43 | public AllDomainCount(File infile) {
|
---|
44 | addStatsForFile(infile);
|
---|
45 | }
|
---|
46 |
|
---|
47 | public int getLineCount() { return lineCount; }
|
---|
48 | public int getDomainsCount() { return domains.size(); }
|
---|
49 | public int getUniqueDomainsCount() { return uniqueDomains.size(); }
|
---|
50 | public int getURLsCount() { return URLs.size(); }
|
---|
51 | public int getUniqueURLsCount() { return uniqueURLs.size(); }
|
---|
52 |
|
---|
53 | public void addStatsForFile(File infile) {
|
---|
54 | try (
|
---|
55 | BufferedReader reader = new BufferedReader(new FileReader(infile));
|
---|
56 | ) {
|
---|
57 |
|
---|
58 | String url = null;
|
---|
59 |
|
---|
60 | while((url = reader.readLine()) != null) {
|
---|
61 | lineCount++;
|
---|
62 | URLs.add(url);
|
---|
63 |
|
---|
64 | // don't care about real uniqueness of domains - Set just ensures that each
|
---|
65 | // domain is listed once. Keeps http(s) and www distinct
|
---|
66 | String domainWithProtocol = Utility.getDomainForURL(url, true);
|
---|
67 | domains.add(domainWithProtocol);
|
---|
68 |
|
---|
69 | // unique domains - so stripped of http(s) and www
|
---|
70 | String basicDomain = Utility.stripProtocolAndWWWFromURL(domainWithProtocol);
|
---|
71 | uniqueDomains.add(basicDomain);
|
---|
72 |
|
---|
73 | //System.err.println("URL: " + url);
|
---|
74 |
|
---|
75 | String basicURL = Utility.stripProtocolAndWWWFromURL(url);
|
---|
76 | uniqueURLs.add(basicURL);
|
---|
77 | }
|
---|
78 |
|
---|
79 | } catch(Exception e) {
|
---|
80 | logger.error("Got exception", e);
|
---|
81 | }
|
---|
82 | }
|
---|
83 |
|
---|
84 | public static void printUsage() {
|
---|
85 | System.err.println("Run as:");
|
---|
86 | System.err.println("\tUniqueDomainCount [file listing urls]");
|
---|
87 | }
|
---|
88 |
|
---|
89 | public static void main(String[] args) {
|
---|
90 |
|
---|
91 |
|
---|
92 | if(args.length != 0 && args.length != 1) {
|
---|
93 | printUsage();
|
---|
94 | System.exit(-1);
|
---|
95 | }
|
---|
96 |
|
---|
97 | AllDomainCount stats = null;
|
---|
98 |
|
---|
99 | if(args.length == 1) {
|
---|
100 | File infile = new File(args[0]);
|
---|
101 | if(!infile.exists()) {
|
---|
102 | logger.error("File " + infile + " does not exist. Terminating.");
|
---|
103 | System.exit(-1);
|
---|
104 | }
|
---|
105 |
|
---|
106 | logger.info("Counting all domains and urls in " + infile.getName());
|
---|
107 | stats = new AllDomainCount(infile);
|
---|
108 | } else {
|
---|
109 | logger.info("Counting all domains and urls in keepURLs.txt + discardURLs.txt + greyListed.txt");
|
---|
110 | stats = new AllDomainCount();
|
---|
111 | }
|
---|
112 |
|
---|
113 |
|
---|
114 |
|
---|
115 | //logger.info(String.format("In file %s:", infile));
|
---|
116 | logger.info("\tCount of unique domains: " + stats.getDomainsCount());
|
---|
117 | logger.info("\tCount of unique basic domains (stripped of protocol and www): " + stats.getUniqueDomainsCount());
|
---|
118 | logger.info("\tLine count: " + stats.getLineCount());
|
---|
119 | logger.info("\tActual unique URL count: " + stats.getURLsCount());
|
---|
120 | logger.info("\tUnique basic URL count (stripped of protocol and www): " + stats.getUniqueURLsCount());
|
---|
121 | logger.info("******************************************************");
|
---|
122 | }
|
---|
123 | }
|
---|