source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33879

Last change on this file since 33879 was 33879, checked in by ak19, 4 years ago

Have the 2 mongodb aggregate() calls working that

File size: 4.5 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9* TO COMPILE OR RUN, FIRST DO:
10 * cd maori-lang-detection/apache-opennlp-1.9.1
11 * export OPENNLP_HOME=`pwd`
12 * cd maori-lang-detection/src
13 *
14 * TO COMPILE:
15 * maori-lang-detection/src$
16 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
17 *
18 * TO RUN:
19 * maori-lang-detection/src$
20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255
21 *
22*/
23public class WebPageURLsListing {
24 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
25
26 private final MongoDBAccess mongodbAccess;
27 private int numURLs;
28 private File domainsFile;
29
30 public WebPageURLsListing(MongoDBAccess mongodbAccess,
31 File domainsFile)
32 {
33 this.mongodbAccess = mongodbAccess;
34 this.domainsFile = domainsFile;
35 }
36
37 public WebPageURLsListing(MongoDBAccess mongodbAccess,
38 File domainsFile,
39 int numURLs)
40 {
41 this(mongodbAccess, domainsFile);
42 this.numURLs = numURLs;
43 }
44
45 public String produceURLsForPagesInMRI() {
46 return writeFile(MongoDBAccess.IS_MRI);
47 }
48
49 public String produceURLsForPagesContainingMRI() {
50 return writeFile(MongoDBAccess.CONTAINS_MRI);
51 }
52
53
54 public String writeFile(int filterType) {
55
56 ArrayList<String> urlsList = new ArrayList<String>();
57
58 // 1. read each url from the domainsFile
59 // 1a. do the query
60 // 1b. add the arraylist result to urls
61
62 try (
63 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
64 ) {
65
66
67 String domain;
68
69 while((domain = reader.readLine()) != null) {
70 domain = domain.trim();
71 if(!domain.equals("")) {
72 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
73 urlsList.addAll(moreURLs);
74 }
75 }
76 } catch(Exception e) {
77 logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath());
78 logger.error(e.getMessage(), e);
79 }
80
81 // Shuffle the urlsList, then write out the first numURLs into a file.
82 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
83 File parentFolder = domainsFile.getParentFile();
84 //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName());
85 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
86 File outFile = new File(parentFolder, fileName+domainsFile.getName());
87
88 // write out ALL the URLs
89 try (
90 Writer writer = new BufferedWriter(new FileWriter(outFile));
91 ) {
92
93 for (int i=0; i < urlsList.size(); i++) {
94 String url = urlsList.get(i);
95 //System.out.println(list.get(i));
96 writer.write(url + "\n");
97 }
98 } catch(Exception e) {
99 logger.error("Unable to write to file " + outFile.getAbsolutePath());
100 logger.error(e.getMessage(), e);
101 }
102
103 /*
104 // shuffle list and take the first n - write to file
105 try (
106 Writer writer = new BufferedWriter(new FileWriter(outFile));
107 ) {
108 Collections.shuffle(urlsList);
109 for (int i=0; i<numURLs; i++) {
110 String url = urlsList.get(i);
111 //System.out.println(list.get(i));
112 writer.write(url + "\n");
113 }
114 } catch(Exception e) {
115 logger.error("Unable to write to file " + outFile.getAbsolutePath());
116 logger.error(e.getMessage(), e);
117 }
118 */
119
120 return outFile.getAbsolutePath();
121 }
122
123
124 public static void printUsage() {
125 System.err.println("WebPageURLsListing [domains.txt]");
126 }
127
128 // Depending on args, generates isMRI and containsMRI file listings for:
129 // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
130
131
132 public static void main(String args[]) {
133 if(args.length >= 2) {
134 printUsage();
135 System.exit(-1);
136 }
137
138 try (
139 MongoDBAccess mongodb = new MongoDBAccess();
140 ) {
141 File domainsFile = new File(args[0]);
142 if(!domainsFile.exists()) {
143 System.err.println("File " + domainsFile + " does not exist");
144 System.exit(-1);
145 }
146
147 //int genNumURLs = Integer.parseInt(args[1]);
148
149 mongodb.connectToDB();
150
151 WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile);
152 //String isMRIFile = listing.produceURLsForPagesInMRI();
153 //String containsMRIFile = listing.produceURLsForPagesContainingMRI();
154 mongodb.aggregateContainsMRIForNZ();
155 mongodb.aggregateContainsMRIForOverseas();
156
157 } catch(Exception e) {
158 logger.error(e.getMessage(), e);
159 }
160 }
161}
Note: See TracBrowser for help on using the repository browser.