source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java@ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved the getDomainForURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
File size: 5.0 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.net.InetAddress;
5import java.util.zip.GZIPInputStream;
6
7import com.maxmind.geoip.*; // for LookupService and Location
8
9import org.apache.log4j.Logger;
10
11public class Utility {
12 private static Logger logger = Logger.getLogger(org.greenstone.atea.Utility.class.getName());
13
14 // Run gunzip
15 // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
16 // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
17 public static boolean unzipFile(File inZipFile, File outFile) {
18
19 byte[] buffer = new byte[1024];
20
21 // try-with-resources will safely close streams/dispose resources on success or error and exceptions
22 try (
23 GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
24 FileOutputStream out = new FileOutputStream(outFile);
25 ) {
26 int len;
27 while ((len = gzis.read(buffer)) > 0) {
28 out.write(buffer, 0, len);
29 }
30
31 //gzis.close();
32 //out.close();
33
34 logger.debug("Unzipped " + inZipFile + " to " + outFile);
35
36 } catch(IOException ex) {
37 //error("Failed to unzip " + inZipFile);
38 System.err.println("Failed to unzip " + inZipFile);
39 ex.printStackTrace();
40 return false;
41 }
42
43 return true;
44 }
45
46 /**
47 * Attribution following below is as per https://dev.maxmind.com/geoip/geoip2/geolite2/
48 *
49 * This product includes GeoLite2 data created by MaxMind, available from
50 * <a href="https://www.maxmind.com">https://www.maxmind.com</a>.
51 *
52 * Usage:
53 * https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java
54 * version I'm using: https://github.com/maxmind/geoip-api-java
55 * Newer version: https://maxmind.github.io/GeoIP2-java/
56 *
57 * @return 2 letter countrycode in uppercase or an exception
58 */
59 public static String getCountryCodeOfDomain(String domainWithProtocol, File geoLiteCityDatFile)
60 throws Exception
61 {
62 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
63 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
64 String domain = domainWithProtocol.substring(startIndex);
65
66 // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP
67 LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);
68
69 // get IP for domain
70 InetAddress inetAddress = InetAddress.getByName(domain);
71 String ipAddress = inetAddress.getHostAddress();
72
73 // get location object for IP
74 Location location = cl.getLocation(ipAddress);
75
76 if(location == null) {
77 throw new Exception("@@@@ No location info in DB for: " + domain);
78 } else {
79 return location.countryCode;
80 }
81
82 }
83
84 /** Work out the 'domain' for a given url.
85 * This retains any www. or subdomain prefix.
86 */
87 public static String getDomainForURL(String url, boolean withProtocol) {
88 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
89 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
90 // the keep the URL around in case param withProtocol=true
91 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
92
93 String domain = url.substring(startIndex);
94 int endIndex = domain.indexOf("/");
95 if(endIndex == -1) endIndex = domain.length();
96 domain = domain.substring(0, endIndex);
97
98 if(withProtocol) {
99 // now that we have the domain (everything to the first / when there is no protocol)
100 // can glue the protocol back on
101 domain = protocol + domain;
102 }
103
104 return domain;
105 }
106
107 public static boolean isDomainInCountry(String domainWithProtocol,
108 String countryCode, File geoLiteCityDatFile)
109 {
110 countryCode = countryCode.toUpperCase();
111
112 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
113 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
114 String domain = domainWithProtocol.substring(startIndex);
115
116 boolean result = false;
117 try {
118 // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP
119 LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);
120
121 // get IP for domain
122 InetAddress inetAddress = InetAddress.getByName(domain);
123 String ipAddress = inetAddress.getHostAddress();
124
125 // get location object for IP
126 Location location = cl.getLocation(ipAddress);
127
128 if(location != null) {
129 // compare country code with tld parameter
130 System.err.println("@@@@ Got country code: " + location.countryCode);
131 result = location.countryCode.equals(countryCode);
132 } else {
133 System.err.println("@@@@ No location info in DB for: " + domainWithProtocol);
134 }
135 } catch(Exception e) {
136 e.printStackTrace();
137 System.err.println("Could not check if domain " + domain + " was in country: " + countryCode);
138 } finally {
139 return result;
140 }
141 }
142}
Note: See TracBrowser for help on using the repository browser.