/*
 * WebPageURLsListing: writes listings of web page URLs obtained from MongoDB.
 *
 * What the visible code does:
 *  - The constructor stores the MongoDBAccess handle and an output folder.
 *  - produceURLsForPagesInMRI(domainsFile) and
 *    produceURLsForPagesContainingMRI(domainsFile) delegate to writeFile()
 *    with MongoDBAccess.IS_MRI and MongoDBAccess.CONTAINS_MRI respectively.
 *  - writeFile(filterType, domainsFile) reads domainsFile line by line, trims
 *    each line, skips empty ones, and for every remaining domain appends the
 *    result of mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType)
 *    to one list. It then writes every collected URL, one per line terminated
 *    by "\n", to a file named "isMRI_" or "containsMRI_" followed by the name
 *    of domainsFile. That file is created in the parent folder of domainsFile;
 *    the outFolder field is not used in the visible part of this method.
 *    Exceptions while reading or writing are logged with logger.error and not
 *    rethrown.
 *  - The visible part of the entry point calls printUsage() and exits with -1
 *    when a condition ending in ">= 2" holds, opens a MongoDBAccess in a
 *    try-with-resources, calls connectToDB(), and builds a listing whose
 *    output folder is "../mongodb-data-auto/". With at least one argument it
 *    only checks that the named file exists (the calls that would use it are
 *    commented out); with no arguments it calls
 *    writeTentativeNonAutotranslatedSites() and
 *    writeOverseasSitesWithMiInURLPath().
 *
 * NOTE(review): this copy of the file is damaged and will not compile as is.
 *  - Line breaks have been lost, so each "//" comment now swallows the rest of
 *    its physical line, including real code.
 *  - Text is missing after "for (int i=0; i": the end of writeFile (including
 *    its return statement), the definitions of printUsage(),
 *    writeTentativeNonAutotranslatedSites() and
 *    writeOverseasSitesWithMiInURLPath(), and the signature of main are not
 *    present. Restore them from version control rather than guessing.
 *  - ArrayList appears without a type argument although
 *    "String url = urlsList.get(i)" needs an element type of String;
 *    presumably the type arguments were stripped together with the missing
 *    text. TODO confirm against the repository copy.
 *  - The TO RUN example below passes a second argument (255), while the
 *    numURLs handling is commented out and the visible usage check rejects
 *    two or more arguments; the example looks stale. TODO confirm.
 */
package org.greenstone.atea; import java.util.*; import java.io.*; import org.apache.log4j.Logger; /** * TO COMPILE OR RUN, FIRST DO: * cd maori-lang-detection/apache-opennlp-1.9.1 * export OPENNLP_HOME=`pwd` * cd maori-lang-detection/src * * TO COMPILE: * maori-lang-detection/src$ * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java * * TO RUN: * maori-lang-detection/src$ * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255 * */ public class WebPageURLsListing { static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName()); private final MongoDBAccess mongodbAccess; private File outFolder; public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) { this.mongodbAccess = mongodbAccess; this.outFolder = outFolder; } public String produceURLsForPagesInMRI(File domainsFile) { return writeFile(MongoDBAccess.IS_MRI, domainsFile); } public String produceURLsForPagesContainingMRI(File domainsFile) { return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile); } public String writeFile(int filterType, File domainsFile/*, int numURLs*/) { ArrayList urlsList = new ArrayList(); // 1. read each url from the domainsFile // 1a. do the query // 1b. add the arraylist result to urls try ( BufferedReader reader = new BufferedReader(new FileReader(domainsFile)); ) { String domain; while((domain = reader.readLine()) != null) { domain = domain.trim(); if(!domain.equals("")) { ArrayList moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); urlsList.addAll(moreURLs); } } } catch(Exception e) { logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath()); logger.error(e.getMessage(), e); } // Shuffle the urlsList, then write out the first numURLs into a file. 
// https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java File parentFolder = domainsFile.getParentFile(); //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName()); String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; File outFile = new File(parentFolder, fileName+domainsFile.getName()); // write out ALL the URLs try ( Writer writer = new BufferedWriter(new FileWriter(outFile)); ) { for (int i=0; i < urlsList.size(); i++) { String url = urlsList.get(i); //System.out.println(list.get(i)); writer.write(url + "\n"); } } catch(Exception e) { logger.error("Unable to write to file " + outFile.getAbsolutePath()); logger.error(e.getMessage(), e); } /* // shuffle list and take the first n - write to file try ( Writer writer = new BufferedWriter(new FileWriter(outFile)); ) { Collections.shuffle(urlsList); for (int i=0; i= 2) { printUsage(); System.exit(-1); } try ( MongoDBAccess mongodb = new MongoDBAccess(); ) { mongodb.connectToDB(); // output files will be stored in mongodb-data-auto File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile(); WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder); if(args.length >= 1) { File domainsFile = new File(args[0]); if(!domainsFile.exists()) { System.err.println("File " + domainsFile + " does not exist"); System.exit(-1); } //int genNumURLs = Integer.parseInt(args[1]); //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile); //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile); //listing.writeWebPagesOfAllNZSitesAndDomainListing(); } else { String filename = listing.writeTentativeNonAutotranslatedSites(); filename = listing.writeOverseasSitesWithMiInURLPath(); } } catch(Exception e) { logger.error(e.getMessage(), e); } } }