Changeset 33623 for gs3-extensions
- Timestamp:
- 2019-11-05T21:04:09+13:00 (4 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt
r33621 r33623 492 492 493 493 494 INSTALLATION MONGO-DB AND CLIENT 495 FROM: https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ 496 wget -qO - https://www.mongodb.org/static/pgp/server-4.2.asc | sudo apt-key add - 497 echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.2.list 498 sudo apt-get update 499 sudo apt-get install -y mongodb-org 500 501 UNINSTALLING 502 https://www.anintegratedworld.com/uninstall-mongodb-in-ubuntu-via-command-line-in-3-easy-steps/ 494 503 495 504 -
gs3-extensions/maori-lang-detection/conf/config.properties
r33615 r33623 23 23 24 24 25 mongodb.user=admin 26 mongodb.pwd=pinky 27 #CHANGEME 28 25 mongodb.user=anupama 26 mongodb.pwd=chang3m3 27 # default mongodb port is 27017. Don't change the port unless you really have configured 28 # your mongodb server to listen at some other port 29 mongodb.port=27017 30 mongodb.host=mongodb.cms.waikato.ac.nz 31 mongodb.dbname=ateacrawldata -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33615 r33623 237 237 } 238 238 239 /** Work out the 'domain' for a given url.240 * This retains any www. or subdomain prefix.241 */242 public static String getDomainForURL(String url, boolean withProtocol) {243 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion245 // the keep the URL around in case param withProtocol=true246 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);247 248 String domain = url.substring(startIndex);249 int endIndex = domain.indexOf("/");250 if(endIndex == -1) endIndex = domain.length();251 domain = domain.substring(0, endIndex);252 253 if(withProtocol) {254 // now that we have the domain (everything to the first / when there is no protocol)255 // can glue the protocol back on256 domain = protocol + domain;257 }258 259 return domain;260 }261 239 262 240 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */ … … 304 282 // work out domain. This retains any www. or subdomain prefix 305 283 // passing true to further also retain the http(s) protocol 306 domainWithProtocol = getDomainForURL(url, true);284 domainWithProtocol = Utility.getDomainForURL(url, true); 307 285 308 286 Set<String> urlsSet; … … 316 294 } 317 295 296 /* 318 297 // Dr Nichols said that a url that was located outside the country and 319 298 // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
… … 322 301 // then add that domain (if not already added) and that url into a file 323 302 // for later manual inspection 324 if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 325 /* 303 if(!domainWithProtocol.endsWith(".nz") 304 && (url.contains("/mi/") || url.endsWith("/mi"))) { 305 326 306 if(!possibleProductDomains.contains(domainWithProtocol)) { 327 307 … … 345 325 if(!isInNZ) { 346 326 possibleProductDomains.add(domainWithProtocol); 347 // write both domain and a sample URL on that site out to file327 // write both domain and a sample seedURL on that site out to file 348 328 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n"); 349 329 possibleProductSitesWriter.write("\t" + url + "\n"); 350 330 } 351 }*/ /*else { 352 // already wrote out domain to file at some point, write just the URL out to file 353 possibleProductSitesWriter.write("\t" + url + "\n"); 354 }*/ 355 } 331 } 332 //else { 333 // already wrote out domain to file at some point, write just the URL out to file 334 //possibleProductSitesWriter.write("\t" + url + "\n"); 335 //} 336 } 337 */ 356 338 } 357 339 } catch (IOException ioe) { … … 686 668 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually, 687 669 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted 688 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol670 String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol 689 671 if(!isBlackListed && url.contains("jasmin")) { 690 672 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol); … … 821 803 public static void printUsage() { 822 804 System.err.println("Run this program as:"); 823 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");805 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> 
<output folder path>"); 824 806 } 825 807 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33622 r33623 2 2 3 3 4 import com.mongodb.client.MongoCollection; 4 5 import com.mongodb.client.MongoDatabase; 5 6 import com.mongodb.MongoClient; 6 7 import com.mongodb.MongoCredential; 8 9 import org.bson.Document; 7 10 8 11 import java.io.BufferedReader; … … 23 26 * TO RUN: 24 27 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess 28 * 29 * Manually connecting to mongodb from client: 30 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p 31 * Then after connecting with pwd, type: 32 * use DBNAME 33 * 34 * Or connect to mongodb and specify db in one statement: 35 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p 36 * 37 * Some links: 38 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection 39 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection") 40 * - https://tecadmin.net/tutorial/mongodb/drop-collection/ 41 * IMPORTANT LINK: 42 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1 43 * 25 44 */ 26 45 public class MongoDBAccess { … … 28 47 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 29 48 30 final staticString HOST = "localhost";31 final staticint PORT = 27017; // mongodb port32 final staticString PROPS_FILENAME = "config.properties";33 final staticString DB_NAME = "ateacrawldata";49 String HOST = "localhost"; 50 int PORT = 27017; // mongodb port 51 String PROPS_FILENAME = "config.properties"; 52 String DB_NAME = "ateacrawldata"; 34 53 35 54 private String USERNAME; … … 37 56 38 57 58 private MongoClient mongo = null; 59 private MongoDatabase database = null; 60 39 61 public MongoDBAccess() throws Exception { 40 62 boolean success = false; … … 55 77 if(USERNAME.equals("")) { 56 78 USERNAME = "root"; 57 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + " defaulting to: " + USERNAME);79 
logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME); 58 80 } 59 81 PASSWORD = props.getProperty("mongodb.pwd"); … … 66 88 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME); 67 89 } 90 91 HOST = props.getProperty("mongodb.host", HOST); 92 String port = props.getProperty("mongodb.port", Integer.toString(PORT)); 93 PORT = Integer.parseInt(port); 94 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME); 95 96 logger.info("Connecting to mongodb with:"); 97 logger.info(" - host: " + HOST); 98 logger.info(" - port: " + PORT); 99 logger.info(" - user: " + USERNAME); 100 logger.info(" - db name: " + DB_NAME); 68 101 } 69 102 70 103 /** 104 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per 105 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection 106 */ 71 107 public void connectToDB() throws Exception { 72 108 // Creating a Mongo client 73 MongoClientmongo = new MongoClient( HOST, PORT );109 mongo = new MongoClient( HOST, PORT ); 74 110 75 111 // Creating Credentials … … 79 115 80 116 // Accessing the database 81 MongoDatabasedatabase = mongo.getDatabase(DB_NAME);82 //System.out.println("Credentials: "+ credential);117 database = mongo.getDatabase(DB_NAME); 118 logger.info("Credentials: "+ credential); 83 119 } 84 120 85 121 122 /* 123 public void insertDocument() { 124 MongoCollection<Document> collection = this.database.getCollection("sampleCollection"); 125 } 126 */ 127 128 // create collection (table in RDBMS) websites, create collection webpages 129 // webpages collection will have sentences embedded 130 86 131 public static void main(String args[]) { 87 132 try { 88 133 MongoDBAccess mongodbCon = new MongoDBAccess(); 89 //mongodbCon.connectToDB(); 134 mongodbCon.connectToDB(); 135 //mongodbCon.insertDocument(); 90 136 }catch(Exception e) { 91 137 
e.printStackTrace(); -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33615 r33623 182 182 TextDumpPage firstPage = pages.get(0); 183 183 String url = firstPage.getPageURL(); 184 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);184 this.domainOfSite = Utility.getDomainForURL(url, true); 185 185 } 186 186 else { … … 248 248 249 249 page.addMRILanguageStatus(isMRI); 250 250 251 251 252 // Even if the entire page is not found to be overall in Māori, 252 253 // let's still inspect the sentences of the page and count how many (if any) … … 281 282 webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++, 282 283 SITE_COUNTER, /* alternative: this.siteID */ 283 url, isMRI, totalSentences, numSentencesInMRI); 284 url, 285 //"origCharEncoding", "modifiedTime", "fetchTime", 286 page.getOriginalCharEncoding(), 287 page.getModifiedTime(), 288 page.getFetchTime(), 289 isMRI, totalSentences, numSentencesInMRI); 284 290 285 291 // Write the sentences that are in te reo into the mri-sentences CSV file … … 393 399 "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 394 400 "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 395 webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI", 396 "numSentences", "numSentencesInMRI"); 401 webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", 402 "origCharEncoding", "modifiedTime", "fetchTime", 403 "isMRI", "numSentences", "numSentencesInMRI"); 397 404 mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence"); 398 405 … … 435 442 436 443 } catch(Exception e) { 437 // can get an exception when instantiating CCWETProcessor instance444 // can get an exception when instantiating NutchTextDumpProcessor instance 445 // or with CSV file 439 446 logger.error(e.getMessage(), e); -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33615 r33623 84 84 String k = line.substring(0, endIndex); 85 85 String v = line.substring(endIndex+1); 86 if(k.startsWith("metadata")) { 87 k = k.substring("metadata".length()); 88 } 89 86 90 tuples.put(k.trim(), v.trim()); 87 91 } else { … … 134 138 } 135 139 140 /* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp 141 or encoding he meant, but storing 2 of several timestamps and selecting 142 original character encoding (presumably the char encoding of the page) out of 2 143 pieces of char encoding metadata to store. */ 144 public String getModifiedTime() { 145 // is this the webpage's last mod time? 146 String time = tuples.get("modifiedTime"); 147 time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset 148 return time; 149 } 150 public String getFetchTime() { 151 // is this the nutch crawl time 152 String time = tuples.get("fetchTime"); 153 time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset 154 return time; 155 156 } 157 public String getOriginalCharEncoding() { 158 // is this the web page's char-encoding? 159 return tuples.get("OriginalCharEncoding"); 160 } 161 136 162 public String get(String key) { 137 163 return tuples.get(key); -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java
r33604 r33623 82 82 } 83 83 84 /** Work out the 'domain' for a given url. 85 * This retains any www. or subdomain prefix. 86 */ 87 public static String getDomainForURL(String url, boolean withProtocol) { 88 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 89 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 90 // the keep the URL around in case param withProtocol=true 91 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex); 92 93 String domain = url.substring(startIndex); 94 int endIndex = domain.indexOf("/"); 95 if(endIndex == -1) endIndex = domain.length(); 96 domain = domain.substring(0, endIndex); 97 98 if(withProtocol) { 99 // now that we have the domain (everything to the first / when there is no protocol) 100 // can glue the protocol back on 101 domain = protocol + domain; 102 } 103 104 return domain; 105 } 106 84 107 public static boolean isDomainInCountry(String domainWithProtocol, 85 108 String countryCode, File geoLiteCityDatFile)
Note:
See TracChangeset
for help on using the changeset viewer.