Changeset 33488 for gs3-extensions
- Timestamp: 2019-09-17T14:48:36+12:00
- Location: gs3-extensions/maori-lang-detection
- Files: 2 edited
gs3-extensions/maori-lang-detection/bin/script/unique_mri_domains_from_cc.sh
r33471 r33488 69 69 70 70 71 done < <(cat sorted_MRI_urls.txt) 71 done < <(cat sorted_MRI_urls.txt) 72 72 73 73 echo "**************************************" -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33480 → r33488

Added imports (alongside the existing java.util.Properties and java.util.zip.GZIPInputStream imports):

    import java.util.Iterator;
    import java.util.Set;
    import java.util.TreeSet;

Added method:

    /**
     * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
     * As output, produces the URL seed list and regex-urlfilter text files required by nutch,
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     */
    public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
        // Maintain Sets of unique domains and urls.
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time), whereas HashSet doesn't
        // guarantee ordering. So we get alphabetic sorting for free, and guaranteed
        // log(n) time for the basic operations.
        Set<String> domainsSet = new TreeSet<String>();
        Set<String> urlsSet = new TreeSet<String>();

        final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*";

        try (
            BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
        ) {
            // Read a URL at a time from urlsFile.
            String url = null;
            String domain = null;
            while ((url = reader.readLine()) != null) { // readLine removes the newline separator

                // Work out the domain. This retains any www. or subdomain prefix:
                int startIndex = url.indexOf("//"); // end of any http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex + 2); // skip past the protocol's // portion
                domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if (endIndex == -1) endIndex = domain.length();
                domain = domain.substring(0, endIndex);

                urlsSet.add(url);
                domainsSet.add(domain);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
        }

        try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
            Iterator<String> i = urlsSet.iterator();
            while (i.hasNext()) {
                String url = i.next();
                seedURLsWriter.write(url + "\n");
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
        }

        try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
            Iterator<String> i = domainsSet.iterator();
            // e.g. nutch.apache.org => +https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            while (i.hasNext()) {
                String domain = i.next();
                domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(domain + "\n");
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
        }
    }

Added at the call site, immediately before the existing return:

    File seedURLsFile = new File(outFolder, "seedURLs.txt");
    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
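To make the transformation concrete, below is a minimal standalone sketch, not part of the changeset, that runs the same domain-extraction and filter-line logic as createSeedURLsFiles over a few URLs and prints the resulting regex-urlfilter.txt lines. The class name and sample URLs are hypothetical, chosen only for illustration.

    import java.util.Set;
    import java.util.TreeSet;

    // Hypothetical demo class; not in the repository.
    public class SeedURLsSketch {
        public static void main(String[] args) {
            // Hypothetical keepURLs.txt contents:
            String[] keepURLs = {
                "https://nutch.apache.org/downloads.html",
                "http://www.example.org/page.html",
                "example.org/index.html" // a URL without a protocol prefix
            };

            final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*";
            Set<String> domainsSet = new TreeSet<String>();

            for (String url : keepURLs) {
                // Same domain extraction as in WETProcessor.createSeedURLsFiles:
                int startIndex = url.indexOf("//");
                startIndex = (startIndex == -1) ? 0 : (startIndex + 2);
                String domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if (endIndex == -1) endIndex = domain.length();
                domainsSet.add(domain.substring(0, endIndex));
            }

            // Prints one filter line per unique domain, alphabetically, e.g.
            //   +https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            for (String domain : domainsSet) {
                System.out.println(FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/");
            }
        }
    }

Per the Nutch tutorial linked in the Javadoc, seedURLs.txt serves as the seed list Nutch injects into its crawldb, while each regex-urlfilter.txt line confines the crawl to one of the collected domains; the ([a-z0-9-]+\.)* prefix also admits that domain's subdomains.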