source: other-projects/maori-lang-detection/src/org/greenstone/atea/AllDomainCount.java@ 33984

Last change on this file since 33984 was 33984, checked in by ak19, 4 years ago

Simple class to summarise some basic counts of the input common crawl data

File size: 4.3 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.*;
5import org.apache.log4j.Logger;
6
7/**
8 * To compile, with the jars in lib/ on the classpath:
9 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/AllDomainCount.java
10 *
11 * To run, passing the log4j and other properties files in conf/ folder:
12 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount
13 *
14 * OR pass one of the files keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt:
15 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.AllDomainCount full/path/to/keepURLs.txt|greyListed.txt|discardURLs.txt|seedURLs.txt
16 *
17 * If no argument provided, then files keepURLs.txt + greyListed.txt + discardURLs.txt,
18 * representing the union of all URLs from CommonCrawl,
19 * are used as input and totals displayed are over all three files.
20 */
21public class AllDomainCount {
22 private static Logger logger = Logger.getLogger(org.greenstone.atea.AllDomainCount.class.getName());
23
24 public static String containingFolder = "../tmp/to_crawl.THE_VERSION_USED/";
25
26 // TreeSet: by default, "the elements are ordered using their natural ordering"
27 private Set<String> domains = new TreeSet<String>();
28 private Set<String> uniqueDomains = new TreeSet<String>(); // stripped of protocol and www
29 private Set<String> URLs = new TreeSet<String>();
30 private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www
31
32 private int lineCount = 0;
33 private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};;
34
35 public AllDomainCount() {
36 for(String filename : filenames) {
37 File infile = new File(containingFolder, filename);
38
39 addStatsForFile(infile);
40 }
41 }
42
43 public AllDomainCount(File infile) {
44 addStatsForFile(infile);
45 }
46
47 public int getLineCount() { return lineCount; }
48 public int getDomainsCount() { return domains.size(); }
49 public int getUniqueDomainsCount() { return uniqueDomains.size(); }
50 public int getURLsCount() { return URLs.size(); }
51 public int getUniqueURLsCount() { return uniqueURLs.size(); }
52
53 public void addStatsForFile(File infile) {
54 try (
55 BufferedReader reader = new BufferedReader(new FileReader(infile));
56 ) {
57
58 String url = null;
59
60 while((url = reader.readLine()) != null) {
61 lineCount++;
62 URLs.add(url);
63
64 // don't care about real uniqueness of domains - Set just ensures that each
65 // domain is listed once. Keeps http(s) and www distinct
66 String domainWithProtocol = Utility.getDomainForURL(url, true);
67 domains.add(domainWithProtocol);
68
69 // unique domains - so stripped of http(s) and www
70 String basicDomain = Utility.stripProtocolAndWWWFromURL(domainWithProtocol);
71 uniqueDomains.add(basicDomain);
72
73 //System.err.println("URL: " + url);
74
75 String basicURL = Utility.stripProtocolAndWWWFromURL(url);
76 uniqueURLs.add(basicURL);
77 }
78
79 } catch(Exception e) {
80 logger.error("Got exception", e);
81 }
82 }
83
84 public static void printUsage() {
85 System.err.println("Run as:");
86 System.err.println("\tUniqueDomainCount [file listing urls]");
87 }
88
89 public static void main(String[] args) {
90
91
92 if(args.length != 0 && args.length != 1) {
93 printUsage();
94 System.exit(-1);
95 }
96
97 AllDomainCount stats = null;
98
99 if(args.length == 1) {
100 File infile = new File(args[0]);
101 if(!infile.exists()) {
102 logger.error("File " + infile + " does not exist. Terminating.");
103 System.exit(-1);
104 }
105
106 logger.info("Counting all domains and urls in " + infile.getName());
107 stats = new AllDomainCount(infile);
108 } else {
109 logger.info("Counting all domains and urls in keepURLs.txt + discardURLs.txt + greyListed.txt");
110 stats = new AllDomainCount();
111 }
112
113
114
115 //logger.info(String.format("In file %s:", infile));
116 logger.info("\tCount of unique domains: " + stats.getDomainsCount());
117 logger.info("\tCount of unique basic domains (stripped of protocol and www): " + stats.getUniqueDomainsCount());
118 logger.info("\tLine count: " + stats.getLineCount());
119 logger.info("\tActual unique URL count: " + stats.getURLsCount());
120 logger.info("\tUnique basic URL count (stripped of protocol and www): " + stats.getUniqueURLsCount());
121 logger.info("******************************************************");
122 }
123}
Note: See TracBrowser for help on using the repository browser.