source: other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java@ 33883

Last change on this file since 33883 was 33883, checked in by ak19, 4 years ago

Clarifications

File size: 3.3 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*; /* includes Random */
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9* TO COMPILE OR RUN, FIRST DO:
10 * cd maori-lang-detection/apache-opennlp-1.9.1
11 * export OPENNLP_HOME=`pwd`
12 * cd maori-lang-detection/src
13 *
14 * TO COMPILE:
15 * maori-lang-detection/src$
16 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator.java
17 *
18 * TO RUN:
19 * maori-lang-detection/src$
20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../mongodb-data/domainsNZ_IsMRI.txt 255
21 *
22*/
23public class RandomURLsForDomainGenerator {
24 static Logger logger = Logger.getLogger(org.greenstone.atea.RandomURLsForDomainGenerator.class.getName());
25
26 private final MongoDBAccess mongodbAccess;
27 private final int numURLs;
28
29 //private String[] urls;
30 ArrayList<String> urlsList = new ArrayList<String>();
31
32 public RandomURLsForDomainGenerator(MongoDBAccess mongodbAccess,
33 File domainsFile,
34 int numURLs)
35 {
36 this.mongodbAccess = mongodbAccess;
37 this.numURLs = numURLs;
38
39 // 1. read each url from the domainsFile
40 // 1a. do the query
41 // 1b. add the arraylist result to urls
42
43 try (
44 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
45 ) {
46
47
48 String domain;
49
50 while((domain = reader.readLine()) != null) {
51 domain = domain.trim();
52 if(!domain.equals("")) {
53 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingIsMRIURLs(domain);
54 urlsList.addAll(moreURLs);
55 }
56 }
57 } catch(Exception e) {
58 logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath());
59 logger.error(e.getMessage(), e);
60 }
61
62 // Shuffle the urlsList, then write out the first numURLs into a file.
63 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
64 File parentFolder = domainsFile.getParentFile();
65 String fileName = domainsFile.getName();
66 File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);
67 //File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);
68
69 // shuffle list and take the first n - write to file
70 try (
71 Writer writer = new BufferedWriter(new FileWriter(outFile));
72 ) {
73 Collections.shuffle(urlsList, new Random(1000));
74 for (int i=0; i<numURLs; i++) {
75 String url = urlsList.get(i);
76 //System.out.println(list.get(i));
77 writer.write(url + "\n");
78 }
79 } catch(Exception e) {
80 logger.error("Unable to write to file " + outFile.getAbsolutePath());
81 logger.error(e.getMessage(), e);
82 }
83
84 }
85
86 public static void printUsage() {
87 System.err.println("RandomURLsForDomainGenerator <domains.txt> <numURLs>");
88 }
89
90 public static void main(String args[]) {
91 if(args.length != 2) {
92 printUsage();
93 System.exit(-1);
94 }
95
96 try (
97 MongoDBAccess mongodb = new MongoDBAccess();
98 ) {
99 File domainsFile = new File(args[0]);
100 if(!domainsFile.exists()) {
101 System.err.println("File " + domainsFile + " does not exist");
102 System.exit(-1);
103 }
104
105 int genNumURLs = Integer.parseInt(args[1]);
106
107 mongodb.connectToDB();
108
109 RandomURLsForDomainGenerator gen =
110 new RandomURLsForDomainGenerator(mongodb, domainsFile, genNumURLs);
111
112 } catch(Exception e) {
113 logger.error(e.getMessage(), e);
114 }
115 }
116}
Note: See TracBrowser for help on using the repository browser.