source: other-projects/maori-lang-detection/src/org/greenstone/atea/Utility.java@ 33810

Last change on this file since 33810 was 33666, checked in by ak19, 5 years ago

Having finished sending all the crawl data to MongoDB:

1. Recrawled the 2 sites which I had earlier noted required recrawling, 00152 and 00332. 00152 required changes to how it was crawled: MP3 files needed to be blocked, as there were HBase error messages about key values being too large.
2. Modified the regex-urlfilter.GS_TEMPLATE file to block mp3 files in general for future crawls too (at the point in the file where jpg etc. are already blocked by Nutch's default regex url filters).
3. Further had to restrict the 00152 site to being crawled only under its /maori/ sub-domain. Since the seedURL maori.html was not under a /maori/ URL, this revealed that the CCWETProcessor code didn't yet allow the filters to accept seedURLs in the case where the crawl was restricted to a subdomain (as expressed in the conf/sites-too-big-to-exhaustively-crawl file) but the seedURL itself didn't match those restricting regex filters. In such cases, CCWETProcessor now adds the non-matching seedURLs to the filters as well (so we get just the single page for each such seedURL), besides a filter on the requested subdomain, so that we still follow all pages linked from seedURLs that do match the subdomain expression. See the illustration below.
4. Added to_crawl.tar.gz to svn: the tarball of the to_crawl sites I actually ran Nutch over, containing all the site folders with the seedURL.txt and regex-urlfilter.txt files that batchcrawl.sh runs over. This didn't use the latest version of the sites folder and blacklist/whitelist files generated by CCWETProcessor, since the latest version was regenerated after the final modifications to CCWETProcessor, which came after crawling was finished. But to_crawl.tar.gz does have a manually modified 00152, with the correct regex-urlfilter file, and uses the newer regex-urlfilter.GS_TEMPLATE file that blocks mp3 files.
5. crawledNode6.tar.gz now contains the dump output for sites 00152 and 00332, which were crawled on node6 today (after which their processed dump.txt file results were added into MongoDB).
6. MoreReading/mongodb.txt now contains the results of some queries I ran against the total Nutch-crawled data.
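
As an illustration of points 2 and 3, the kinds of rules involved might look like the following in Nutch's regex-urlfilter syntax (this is a sketch only: the domain below is hypothetical, since the site is identified above only by its ID 00152):

    # block mp3 audio alongside the image/media suffixes Nutch already skips by default
    -\.(mp3|MP3)$

    # restrict a site like 00152 to its /maori/ area, while still admitting
    # the seedURL itself, which sits outside /maori/
    +^https?://www\.example\.org/maori\.html$
    +^https?://www\.example\.org/maori/
    -.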

File size: 5.5 KB
package org.greenstone.atea;

import java.io.*;
import java.net.InetAddress;
import java.util.zip.GZIPInputStream;

import com.maxmind.geoip.*; // for LookupService and Location

import org.apache.log4j.Logger;

public class Utility {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.Utility.class.getName());

    // Run gunzip.
    // To avoid making this Linux-specific, use Java to unzip instead of running gunzip as a process.
    // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
    public static boolean unzipFile(File inZipFile, File outFile) {

        byte[] buffer = new byte[1024];

        // try-with-resources safely closes the streams on success as well as on error/exception
        try (
            GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
            FileOutputStream out = new FileOutputStream(outFile);
        ) {
            int len;
            while ((len = gzis.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }

            logger.debug("Unzipped " + inZipFile + " to " + outFile);

        } catch (IOException ex) {
            System.err.println("Failed to unzip " + inZipFile);
            ex.printStackTrace();
            return false;
        }

        return true;
    }
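
    // Usage sketch (the file names here are hypothetical, not from this project):
    //   if (Utility.unzipFile(new File("dump.txt.gz"), new File("dump.txt"))) {
    //       // ... process the decompressed dump.txt ...
    //   }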

    /**
     * Attribution below is as required by https://dev.maxmind.com/geoip/geoip2/geolite2/
     *
     * This product includes GeoLite2 data created by MaxMind, available from
     * <a href="https://www.maxmind.com">https://www.maxmind.com</a>.
     *
     * Usage:
     * https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java
     * Version I'm using: https://github.com/maxmind/geoip-api-java
     * Newer version: https://maxmind.github.io/GeoIP2-java/
     *
     * @return the 2-letter country code in uppercase
     * @throws Exception if the DB contains no location info for the domain's IP
     */
    public static String getCountryCodeOfDomain(String domainWithProtocol, File geoLiteCityDatFile)
        throws Exception
    {
        String domain = stripProtocolFromURL(domainWithProtocol);

        // pass in the GeoLiteCity.dat file to be able to do the location lookup for the domain's IP
        LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);

        // get IP for domain
        InetAddress inetAddress = InetAddress.getByName(domain);
        String ipAddress = inetAddress.getHostAddress();

        // get location object for IP
        Location location = cl.getLocation(ipAddress);

        if (location == null) {
            throw new Exception("@@@@ No location info in DB for: " + domain);
        } else {
            return location.countryCode;
        }
    }

    public static String stripProtocolAndWWWFromURL(String url) {
        url = stripProtocolFromURL(url);

        if (url.startsWith("www.")) { // strip any "www." at the start as well
            url = url.substring(4);
        }

        return url;
    }

    public static String stripProtocolFromURL(String url) {
        int startIndex = url.indexOf("//"); // for the http:// or https:// prefix
        startIndex = (startIndex == -1) ? 0 : (startIndex + 2); // skip past the protocol's // portion
        return url.substring(startIndex);
    }
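
    // For example (hypothetical URLs):
    //   stripProtocolFromURL("https://www.example.org/page")       returns "www.example.org/page"
    //   stripProtocolAndWWWFromURL("https://www.example.org/page") returns "example.org/page"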


    /** Work out the 'domain' for a given url.
     * This retains any www. or subdomain prefix.
     */
    public static String getDomainForURL(String url, boolean withProtocol) {
        int startIndex = url.indexOf("//"); // for the http:// or https:// prefix
        startIndex = (startIndex == -1) ? 0 : (startIndex + 2); // skip past the protocol's // portion
        // keep the protocol around in case param withProtocol=true
        // (when there was no protocol, startIndex is 0 and this is the empty string)
        String protocol = url.substring(0, startIndex);

        String domain = url.substring(startIndex);
        int endIndex = domain.indexOf("/");
        if (endIndex == -1) endIndex = domain.length();
        domain = domain.substring(0, endIndex);

        if (withProtocol) {
            // now that we have the domain (everything up to the first / when there is no protocol),
            // we can glue the protocol back on
            domain = protocol + domain;
        }

        return domain;
    }
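
    // For example (hypothetical URL):
    //   getDomainForURL("https://www.example.org/mi/page.html", true)  returns "https://www.example.org"
    //   getDomainForURL("https://www.example.org/mi/page.html", false) returns "www.example.org"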

    public static boolean isDomainInCountry(String domainWithProtocol,
                                            String countryCode, File geoLiteCityDatFile)
    {
        countryCode = countryCode.toUpperCase();
        String domain = stripProtocolFromURL(domainWithProtocol);

        boolean result = false;
        try {
            // pass in the GeoLiteCity.dat file to be able to do the location lookup for the domain's IP
            LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);

            // get IP for domain
            InetAddress inetAddress = InetAddress.getByName(domain);
            String ipAddress = inetAddress.getHostAddress();

            // get location object for IP
            Location location = cl.getLocation(ipAddress);

            if (location != null) {
                // compare the location's country code with the countryCode parameter
                System.err.println("@@@@ Got country code: " + location.countryCode);
                result = location.countryCode.equals(countryCode);
            } else {
                System.err.println("@@@@ No location info in DB for: " + domainWithProtocol);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("Could not check if domain " + domain + " was in country: " + countryCode);
        }
        // return after the try/catch: returning from a finally block would swallow exceptions
        return result;
    }
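
    // A minimal usage sketch of the geo-lookup methods, not part of the original class:
    // the file path and URL below are hypothetical, and GeoLiteCity.dat must be
    // downloaded from MaxMind separately for the lookups to work.
    public static void main(String[] args) {
        File geoDatFile = new File("GeoLiteCity.dat"); // hypothetical path

        try {
            // look up the hosting country of a domain, e.g. "NZ"
            String countryCode = Utility.getCountryCodeOfDomain("https://www.example.org", geoDatFile);
            System.out.println("Country code: " + countryCode);
        } catch (Exception e) {
            e.printStackTrace();
        }

        // or test membership of a particular country directly
        System.out.println("In NZ? " + Utility.isDomainInCountry("https://www.example.org", "nz", geoDatFile));
    }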
}