Changeset 33562 for gs3-extensions
- Timestamp: 2019-10-11T21:52:40+13:00 (5 years ago)
- Location: gs3-extensions/maori-lang-detection
- Files: 3 added, 2 edited
Legend:
- unchanged lines have no marker
- added lines are marked with +
- removed lines are marked with -
gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt
(r33561 → r33562)

  # FORMAT OF THIS FILE'S CONTENTS:
  #    <topsite-base-url>,<value>
- # where <value> can be empty or one of SUBDOMAIN-COPY, SINGLEPAGE, <url-form-without-protocol>
+ # where <value> can be empty or one of
+ #    SUBDOMAIN-COPY, FOLLOW-LINKS-WITHIN-TOPSITE, SINGLEPAGE, <url-form-without-protocol>
  #
- # - if value is empty: if seedurl contains topsite-base-url, the seedurl will go into the file
- #   unprocessed-topsite-matches.txt and the site/page won't be crawled.
+ # - if value is left empty: if seedurl contains topsite-base-url, the seedurl will go into the
+ #   file unprocessed-topsite-matches.txt and the site/page won't be crawled.
  #   The user will be notified to inspect the file unprocessed-topsite-matches.txt.
  # - SINGLEPAGE: if seedurl matches topsite-base-url, then only download the page at that seedurl.
  …
  #   crawl to just mi.wikipedia.org.
  #   Remember to leave out any protocol <from url-form-without-protocol>.
-
- # column 3: whether nutch should do fetch all or not
- # column 4: number of crawl iterations
+ #
+ # TODO If useful:
+ #    column 3: whether nutch should do fetch all or not
+ #    column 4: number of crawl iterations

  # docs.google.com is a special case: not all pages are public and any interlinking is likely to
- # be intentional. But SUBDOMAIN-COPY does not work: as seedURL's domain becomes docs.google.com
- # which, when combined with SUBDOMAIN-COPY, the Java code treats as a special case so that
- # any seedURL on docs.google.com ends up pushed out into the "unprocessed....txt" text file.
- #docs.google.com,SUBDOMAIN-COPY
+ # be intentional. Grab all linked pages, for link depth set with nutch's crawl, as long as the
+ # links are within the given topsite-base-url
+ docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE

+ # Just crawl a single page for these:
  drive.google.com,SINGLEPAGE
  forms.office.com,SINGLEPAGE
  …
  # Special case of yale.edu: its Rapa-Nui pages are on blacklist, but we want this page + its photos
  # The page's containing folder is whitelisted in case the photos are there.
- korora.econ.yale.edu,,SINGLEPAGE
+ korora.econ.yale.edu,SINGLEPAGE

  000webhost.com
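The <value> column is what tells the crawler how to treat a seedURL that matches one of these topsites. Below is a small hypothetical sketch of that dispatch, based only on the semantics described in the file's comments; the class, method names, and example URLs are invented for illustration and are not part of the changeset:

```java
// Hypothetical illustration only: the value semantics come from the comments in
// sites-too-big-to-exhaustively-crawl.txt, but the class and method names are made up.
public class TopsiteValueDemo {

    static void handleTopsiteMatch(String seedUrl, String value) {
        switch (value) {
        case "":
            // empty value: set the seedURL aside in unprocessed-topsite-matches.txt, don't crawl it
            System.out.println("unprocessed-topsite-matches.txt <- " + seedUrl);
            break;
        case "SINGLEPAGE":
            // download only the page at the seedURL itself
            System.out.println("fetch single page: " + seedUrl);
            break;
        case "FOLLOW-LINKS-WITHIN-TOPSITE":
            // follow links (up to nutch's crawl depth) as long as they stay within the topsite
            System.out.println("crawl within topsite: " + seedUrl);
            break;
        case "SUBDOMAIN-COPY":
            // keep the crawl within the seedURL's own subdomain
            System.out.println("crawl subdomain only: " + seedUrl);
            break;
        default:
            // a <url-form-without-protocol>: restrict the crawl to URLs matching that form
            System.out.println("crawl URLs matching " + value + ": " + seedUrl);
        }
    }

    public static void main(String[] args) {
        handleTopsiteMatch("https://docs.google.com/document/d/example", "FOLLOW-LINKS-WITHIN-TOPSITE");
        handleTopsiteMatch("https://korora.econ.yale.edu/example.html", "SINGLEPAGE");
    }
}
```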
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
(r33561 → r33562)

  import java.io.*;
+ import java.nio.charset.StandardCharsets;
  import java.util.Properties;
  import java.util.zip.GZIPInputStream;
  …
  import java.util.TreeSet;

+ import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
  import org.apache.log4j.Logger;
  …
      private int wetFileCount = 0;

+     private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
+
      public CCWETProcessor(File inFolder, File outFolder) throws Exception {
          this.commoncrawlDir = inFolder;
  …
          // load up the properties from the config file
-         try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
+         try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
              configProperties = new Properties();
              configProperties.load(infile);
  …
          System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
          topSitesMap = new HashMap<String, String>();
-         //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
-
-         try (
-             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
-         ) {
-             String str = null;
-             while((str = reader.readLine()) != null) {
-                 str = str.trim();
-                 if(str.equals("") || str.startsWith("#")) {
-                     continue;
-                 }
-
-                 // comma separated list of values
-                 int splitindex = str.indexOf(",");
-                 if(splitindex == -1) {
-                     topSitesMap.put(str, "");
-                 } else {
-                     String topsite = str.substring(0, splitindex).trim();
-                     String allowed_url_pattern = str.substring(splitindex+1).trim();
-                     topSitesMap.put(topsite, allowed_url_pattern);
-                 }
-             }
-         } catch (IOException ioe) {
-             ioe.printStackTrace();
-             System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
-         }
+         // Read in our csv file of topsites and what to do when one hits a match with a seedURL
+         // and put these in our topSitesMap
+         // https://commons.apache.org/proper/commons-csv/apidocs/index.html
+         // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
+         // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
+         CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
+             .withCommentMarker('#')
+             .withSkipHeaderRecord()
+             .withIgnoreSurroundingSpaces();
+
+         File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
+         // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
+         try (
+             CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
+         ) {
+             for (CSVRecord csvRecord : parser) {
+                 String topsite = csvRecord.get(0);
+                 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
+                 topSitesMap.put(topsite, allowed_url_pattern);
+
+                 //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
+             }
+         } catch(Exception e) {
+             e.printStackTrace();
+             System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
+         }

          //System.err.println("Prematurely terminating for testing purposes.");
          //System.exit(-1);
  …
          // if filterListFilename does not exist in the conf folder, just return
-         if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
+         if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
              System.err.println(filterListFilename + " does not exist");
              return;
          }

          try (
-             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
+             BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
          ) {
              String filter = null;
  …
          File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());

          for(int i = 0; i < ccrawlFolders.length; i++) {
              File ccrawlFolder = ccrawlFolders[i];
  …
              ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
          }

          // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
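For reference, here is a minimal, self-contained sketch of the same Commons CSV approach the changeset adopts, assuming commons-csv is on the classpath; the class name, main method, and local file path are illustrative only and are not part of the changeset:

```java
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class TopSitesParseDemo {
    public static void main(String[] args) throws Exception {
        // Path is an assumption for this demo; the real code loads the file off the classpath.
        File topSitesCSVData = new File("conf/sites-too-big-to-exhaustively-crawl.txt");

        // '#' lines become comments and surrounding whitespace is trimmed,
        // mirroring the CSVFormat configured in the changeset.
        CSVFormat format = CSVFormat.DEFAULT
            .withCommentMarker('#')
            .withIgnoreSurroundingSpaces();

        Map<String, String> topSitesMap = new HashMap<>();
        try (CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, format)) {
            for (CSVRecord record : parser) {
                String topsite = record.get(0);
                // the second column is optional, so fall back to "" when it is absent
                String allowedUrlPattern = (record.size() >= 2) ? record.get(1) : "";
                topSitesMap.put(topsite, allowedUrlPattern);
            }
        }
        topSitesMap.forEach((site, value) -> System.out.println(site + " -> " + value));
    }
}
```

Letting the comment marker discard the '#' lines and checking record.size() for the optional second column replaces the manual readLine()/indexOf(",") splitting that the old code did by hand.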