Changeset 33518

Timestamp: 24.09.2019 21:13:47
Author: ak19
Message:

Intermediate commit: the seed URLs file is temporarily written out as each domain followed by CommonCrawl's URLs within that domain. In the next commit, I will try splitting these into individual files per domain, each with its own regex-urlfilter txt file restricted to just that site/domain, while reverting the seed URLs file's output to all URLs, sorted.
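For illustration, the seed URLs file as temporarily written out by this commit takes the shape below: each domain on its own line, followed by that domain's URLs with a tab up front. The sample domains and URLs here are hypothetical (borrowed from the pinky.com example in the code comments in the diff), not actual CommonCrawl output:

    pinky.com
    	http://pinky.com/toto/index.html
    	http://pinky.com/toto/nono/file.html
    nutch.apache.org
    	http://nutch.apache.org/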

Files: 1 modified

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

--- gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33517)
+++ gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33518)
@@ -9,4 +9,5 @@
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 import java.util.TreeSet;
 
@@ -92,8 +93,12 @@
 
     /** Map of domains we keep and the full urls we're keeping that are of that domain.
-     * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys,
-     * since a HashMap has no notion of ordering.
-     */
-    private TreeMap<String, TreeSet<String>> domainsToURLsMap;
+     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
+     * while a HashMap has no notion of ordering, because we just need to store urls with
+     * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
+     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
+     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
+     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
+     */
+    private Map<String, Set<String>> domainsToURLsMap;
 
     // Keep a count of all the records that all WETProcessors instantiated
@@ -184,7 +189,8 @@
     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
 
-    Set<String> domainsSet = new TreeSet<String>();
-    Set<String> urlsSet = new TreeSet<String>();
-
+    //Set<String> domainsSet = new TreeSet<String>();
+    //Set<String> urlsSet = new TreeSet<String>();
+    domainsToURLsMap = new TreeMap<String, Set<String>>();
+
     final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
 
@@ -206,8 +212,16 @@
         domain = domain.substring(0, endIndex);
 
-        //if(!domainsMap.containsKey(domain)) {
-        urlsSet.add(url);
-        domainsSet.add(domain);
-        //}
+        //urlsSet.add(url);
+        //domainsSet.add(domain);
+        Set<String> urlsSet;
+        if(!domainsToURLsMap.containsKey(domain)) {
+            urlsSet = new TreeSet<String>();
+            urlsSet.add(url);
+            domainsToURLsMap.put(domain, urlsSet);
+        } else {
+            urlsSet = domainsToURLsMap.get(domain);
+            urlsSet.add(url);
+        }
+
         }
     } catch (IOException ioe) {
@@ -215,5 +229,6 @@
         System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
     }
-
+
+    /*
    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
        Iterator<String> i = urlsSet.iterator();
@@ -227,6 +242,35 @@
        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
    }
-
+    */
+
+    // write out each domain followed in sequence by all urls we found in that domain
+    // (urls with tab up front)
+    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
+        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
+        Set<String> domainsSet = domainsToURLsMap.keySet();
+        Iterator<String> domainIterator = domainsSet.iterator();
+
+        while(domainIterator.hasNext()) {
+        // write out the domain
+        String domain = domainIterator.next();
+        seedURLsWriter.write(domain + "\n");
+
+        // next write out the urls for the domain with a tab prefixed to each
+        Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+        Iterator<String> urlIterator = urlsForDomainSet.iterator();
+        while(urlIterator.hasNext()) {
+            String url = urlIterator.next();
+            seedURLsWriter.write("\t" + url + "\n");
+        }
+        }
+
+    } catch (IOException ioe) {
+        ioe.printStackTrace();
+        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
+    }
+
+    // write out domains as regular expressions into "regex-urlfilter.txt" file
     try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
+        Set<String> domainsSet = domainsToURLsMap.keySet();
         Iterator<String> i = domainsSet.iterator();
         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
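For readers following the diff, below is a minimal standalone sketch of the new domain-grouping and write-out logic, assuming hypothetical input URLs and a hypothetical output file name (seedURLs.txt); it is not the actual CCWETProcessor code, which reads its urls from keepURLsFile and extracts domains differently.

    // Minimal standalone sketch (not the actual CCWETProcessor): groups hypothetical
    // URLs by domain in a TreeMap so domains and per-domain URLs stay sorted, then
    // writes each domain followed by its tab-prefixed URLs, as in the diff above.
    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.Map;
    import java.util.Set;
    import java.util.TreeMap;
    import java.util.TreeSet;

    public class SeedURLsSketch {
        public static void main(String[] args) {
            // Hypothetical sample input standing in for the urls read from keepURLsFile
            String[] urls = {
                "http://pinky.com/toto/nono/file.html",
                "http://pinky.com/toto/index.html",
                "http://nutch.apache.org/"
            };

            // TreeMap keeps domains in natural (alphabetical) order; TreeSet does
            // the same for the urls within each domain.
            Map<String, Set<String>> domainsToURLsMap = new TreeMap<String, Set<String>>();

            for (String url : urls) {
                // Crude domain extraction for the sketch: strip the protocol prefix
                // and keep everything up to the first slash.
                String domain = url.replaceFirst("^https?://", "");
                int endIndex = domain.indexOf("/");
                if (endIndex != -1) {
                    domain = domain.substring(0, endIndex);
                }

                Set<String> urlsSet = domainsToURLsMap.get(domain);
                if (urlsSet == null) {
                    urlsSet = new TreeSet<String>();
                    domainsToURLsMap.put(domain, urlsSet);
                }
                urlsSet.add(url);
            }

            // Write out each domain followed by its urls, one per line with a tab up front.
            try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter("seedURLs.txt"))) {
                for (Map.Entry<String, Set<String>> entry : domainsToURLsMap.entrySet()) {
                    seedURLsWriter.write(entry.getKey() + "\n");
                    for (String url : entry.getValue()) {
                        seedURLsWriter.write("\t" + url + "\n");
                    }
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
                System.err.println("Error writing to seedURLs.txt");
            }
        }
    }

Iterating entrySet() avoids the repeated get() per domain that the keySet() version in the diff performs; the behaviour is otherwise the same. Note also that in the committed version, the catch block for the seedURLsFile writer reports "Error writing to " + urlFilterFile, an apparent copy-paste slip; the sketch reports the file it actually writes.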