- Timestamp:
- 2019-09-24T20:30:40+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33515 r33517 77 77 public final File greyListedFile; 78 78 79 /** Possible values stored in the blackList/whiteList/greyList Maps */ 79 80 private final Integer LIST_ENTRY_CONTAINS = new Integer(0); 80 81 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1); 81 82 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2); 82 83 private final Integer LIST_ENTRY_MATCHES = new Integer(3); 83 84 85 /** 86 * Store url patterns as keys and values indicating whether a url should 87 * match it exactly, start/end with it, or contain it 88 */ 84 89 private HashMap<String, Integer> blackList; 85 90 private HashMap<String, Integer> greyList; 86 91 private HashMap<String, Integer> whiteList; 87 92 93 /** Map of domains we keep and the full urls we're keeping that are of that domain. 94 * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys, 95 * since a HashMap has no notion of ordering. 96 */ 97 private TreeMap<String, TreeSet<String>> domainsToURLsMap; 98 88 99 // Keep a count of all the records that all WETProcessors instantiated 89 100 // by our main method combined have processed … … 144 155 } 145 156 157 // prepare our blacklist, greylist (for inspection) and whitelist 146 158 System.err.println("Loading blacklist."); 147 159 blackList = new HashMap<String, Integer>(); 148 160 initURLFilterList(blackList, "url-blacklist-filter.txt"); 161 149 162 System.err.println("Loading greylist."); 150 163 greyList = new HashMap<String, Integer>(); 151 164 initURLFilterList(greyList, "url-greylist-filter.txt"); 165 152 166 System.err.println("Loading whitelist."); 153 167 whiteList = new HashMap<String, Integer>(); … … 159 173 160 174 /** 161 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.162 * As output produces the URL seed list and regex-urlfilter text files required by nutch,175 * Using the keepURLs.txt file generated by running WETProcessor instances, produces 176 * as output the URL seed list and regex-urlfilter text 
files required by nutch, see 163 177 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 164 178 */ … … 448 462 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 449 463 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile); 464 465 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 466 450 467 } catch(Exception e) { 451 468 // can get an exception when instantiating CCWETProcessor instance
Note:
See TracChangeset
for help on using the changeset viewer.