source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33873

Last change on this file since 33873 was 33873, checked in by ak19, 4 years ago

Beginnings of WebPageURLsListing program whose purpose Dr Bainbridge described today.

File size: 3.8 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9* TO COMPILE OR RUN, FIRST DO:
10 * cd maori-lang-detection/apache-opennlp-1.9.1
11 * export OPENNLP_HOME=`pwd`
12 * cd maori-lang-detection/src
13 *
14 * TO COMPILE:
15 * maori-lang-detection/src$
16 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
17 *
18 * TO RUN:
19 * maori-lang-detection/src$
20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255
21 *
22*/
23public class WebPageURLsListing {
24 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
25
26 private final MongoDBAccess mongodbAccess;
27 private final int numURLs;
28
29 //private String[] urls;
30 ArrayList<String> urlsList = new ArrayList<String>();
31
32 public WebPageURLsListing(MongoDBAccess mongodbAccess,
33 File domainsFile,
34 int numURLs)
35 {
36 this.mongodbAccess = mongodbAccess;
37 this.numURLs = numURLs;
38
39 // 1. read each url from the domainsFile
40 // 1a. do the query
41 // 1b. add the arraylist result to urls
42
43 try (
44 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
45 ) {
46
47
48 String domain;
49
50 while((domain = reader.readLine()) != null) {
51 domain = domain.trim();
52 if(!domain.equals("")) {
53 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingIsMRIURLs(domain);
54 urlsList.addAll(moreURLs);
55 }
56 }
57 } catch(Exception e) {
58 logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath());
59 logger.error(e.getMessage(), e);
60 }
61
62 // Shuffle the urlsList, then write out the first numURLs into a file.
63 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
64 File parentFolder = domainsFile.getParentFile();
65 String fileName = domainsFile.getName();
66 //File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);
67 File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);
68
69 // write out ALL the URLs
70 try (
71 Writer writer = new BufferedWriter(new FileWriter(fullSetOutFile));
72 ) {
73
74 for (int i=0; i < urlsList.size(); i++) {
75 String url = urlsList.get(i);
76 //System.out.println(list.get(i));
77 writer.write(url + "\n");
78 }
79 } catch(Exception e) {
80 logger.error("Unable to write to file " + fullSetOutFile.getAbsolutePath());
81 logger.error(e.getMessage(), e);
82 }
83
84 /*
85 // shuffle list and take the first n - write to file
86 try (
87 Writer writer = new BufferedWriter(new FileWriter(outFile));
88 ) {
89 Collections.shuffle(urlsList);
90 for (int i=0; i<numURLs; i++) {
91 String url = urlsList.get(i);
92 //System.out.println(list.get(i));
93 writer.write(url + "\n");
94 }
95 } catch(Exception e) {
96 logger.error("Unable to write to file " + outFile.getAbsolutePath());
97 logger.error(e.getMessage(), e);
98 }
99 */
100 }
101
102 public static void printUsage() {
103 System.err.println("WebPageURLsListing [domains.txt]");
104 }
105
106 // Depending on args, generates isMRI and containsMRI file listings for:
107 // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
108
109
110
111 public static void main(String args[]) {
112 if(args.length >= 2) {
113 printUsage();
114 System.exit(-1);
115 }
116
117 try (
118 MongoDBAccess mongodb = new MongoDBAccess();
119 ) {
120 File domainsFile = new File(args[0]);
121 if(!domainsFile.exists()) {
122 System.err.println("File " + domainsFile + " does not exist");
123 System.exit(-1);
124 }
125
126 int genNumURLs = Integer.parseInt(args[1]);
127
128 mongodb.connectToDB();
129
130 WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile);
131
132 } catch(Exception e) {
133 logger.error(e.getMessage(), e);
134 }
135 }
136}
Note: See TracBrowser for help on using the repository browser.