Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33963

Timestamp:

2020-02-20T22:12:43+13:00 (4 years ago)

Author:

ak19

Message:

Added a new helper method to MongoDBQueryer.java to add numPagesInMRI and totalPages totals columns into the csv file

Location:

other-projects/maori-lang-detection/src/org/greenstone/atea

Files:

: 2 edited

ManualURLInspection.java (modified) (5 diffs)
MongoDBQueryer.java (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java

-              r33961
+              r33963
     public static final int IS_REALLY_IN_MRI_COLUMN = 2;
     public static final int QUALITY_LEVEL_COLUMN = 3;
+    public static final int COUNT_OF_PAGES_IN_MRI_COLUMN = 4; // count as detected by OpenNLP
+    public static final int TOTAL_PAGES_IN_SITE_COLUMN = 5;
     /** Possible values for the Quality Level column of the csv file */
 …
         if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
             qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
-            /*
-            qualityLevel = qualityLevel.toUpperCase();
-            if(qualityLevel.equals("N")) {
-            qualityLevel = NAV;
-            } else if(qualityLevel.equals("L")) {
-            qualityLevel = LITTLE_TEXT;
-            } else if(qualityLevel.equals("M")) {
-            qualityLevel = MIXED_TEXT;
-            } else if(qualityLevel.equals("P")) {
-            qualityLevel = MAORI_PARAGRAPHS;
-            } else if(qualityLevel.equals("S")) {
-            qualityLevel = SIGNIFICANTLY_MAORI;
-            } else if(qualityLevel.equals("W")) {
-            qualityLevel = WORDS;
-            } else if(qualityLevel.equals("O")) {
-            qualityLevel = OTHER_LANGUAGE;
-            } else if(qualityLevel.equals("E")) {
-            qualityLevel = POEMS_OR_SONGS;
-            } else if(qualityLevel.equals("I")) {
-            qualityLevel = SINGLE_MRI_SENTENCE;
-            } else if(qualityLevel.equals("T")) {
-            qualityLevel = LINK_TEXT;
+            }
-            // else remains at whatever was already in the file or
-            // else "" if no qualityLevel column for this record present in the file yet
-            */
             // Force valid values or ""
 …
         if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
             qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
             // Force valid values or ""
             qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
+        }
+        }
         if(terminate || !qualityLevel.equals(fieldValue)
            /* || basicURL.equals("paekupu.co.nz") // when reviewing MIXED_TEXT */
 …
+    }
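+    /**
+     * Add 2 new columns to each record of the csv file: the number of pages in the site that are
+     * in MRI (numPagesInMRI) and the total number of pages in the site (totalPages).
+     * Records are read from webPageURLsCSVFile and written, with the 2 extra columns, to tmpOutFile.
+     */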
+     public void insertTotalsIntoCSVRecords() {
+    boolean terminate = false;
+    CSVParser parser = null;
+    try {
+        parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
+    } catch(Exception e) {
+        logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
+        return;
+    }
+    try (
+         CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
+         ) {
+        int recordCount = 0;
+        for (CSVRecord csvRecord : parser) {
+        String url = csvRecord.get(URL_COLUMN);
+        if(url.equals("")) { // skip empty lines
+            continue;
+        }
+        recordCount++;
+        String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
+        String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
+        String isReallyInMRI = "";
+        String qualityLevel = "";
+        if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
+            isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
+        }
+        if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
+            qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
+        }
+        //COUNT_OF_PAGES_IN_MRI_COLUMN; TOTAL_PAGES_IN_SITE_COLUMN;
+        long countNumPagesInMRI = mongodbQueryer.getFieldTotalForDomainSuffix(
+                      basicURL, MongoDBQueryer.FIELD_NUM_PAGES_IN_MRI);
+        long countTotalPages = mongodbQueryer.getFieldTotalForDomainSuffix(
+                     basicURL, MongoDBQueryer.FIELD_TOTAL_PAGES);
+        logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
+                + " - " + isReallyInMRI + " - " + qualityLevel
+                + " - " + countNumPagesInMRI + " - " + countTotalPages);
+        // Save the CSV record into the tmp file with the 2 counts columns
+        csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel,
+                      countNumPagesInMRI, countTotalPages);
+        }
+    } catch(Exception e) {
+        e.printStackTrace();
+        logger.error("Exception occurred when processing CSV file or writing out file:\n"
+             + Utility.getFilePath(tmpOutFile));
+        logger.error(e.getMessage(), e);
+    }
+    }
     public static void printUsage() {
     System.err.println("Usage: ManualURLInspection webPageURLs.txt");
 …
+        inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE");
+        //inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE");
+        inspector.insertTotalsIntoCSVRecords();
         //logger.info("Generated temp CSV file: " + filename);

other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

-              r33940
+              r33963
 import com.mongodb.client.AggregateIterable;
+import com.mongodb.client.FindIterable;
 import com.mongodb.client.MongoCollection;
 …
     public static final int CONTAINS_MRI = 1;
+    /** Some reused fieldnames in the Websites collection */
+    private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
+    private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
+    /** Selectors for which field getFieldTotalForDomainSuffix() should total */
+    static final int FIELD_NUM_PAGES_IN_MRI = 0;
+    static final int FIELD_TOTAL_PAGES = 1;
     public MongoDBQueryer() throws Exception {
 …
     return result;
+    }
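+    /**
+     * Totals one numeric field across all documents in the Websites collection whose
+     * domain matches the given partial domain.
+     * @param domainWithoutProtocolOrWWWPrefix the domain to match, without protocol or www prefix
+     * @param fieldToTotal FIELD_NUM_PAGES_IN_MRI to total the numPagesInMRI field; any other
+     * value (normally FIELD_TOTAL_PAGES) totals the totalPages field
+     * @return the selected field's total for the given domain without protocol or www prefix.
+     * Note that multiple sites may match the same partial domain, so their values will have been
+     * totalled before the final value is returned.
+     */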
+    public long getFieldTotalForDomainSuffix(
+             String domainWithoutProtocolOrWWWPrefix, int fieldToTotal)
+    {
+    MongoCollection<Document> collection = getWebsitesCollection();
+    // escape dots in domain for regex to get the query string
+    String regexDomain = domainWithoutProtocolOrWWWPrefix.replace(".", "\\.");
+    String query = "{domain: /DOMAIN/}".replace("DOMAIN", regexDomain);
+    BasicDBObject findObj = BasicDBObject.parse(query);
+    String projection = (fieldToTotal == FIELD_NUM_PAGES_IN_MRI) ?
+        "{numPagesInMRI: 1, _id: 0}" : "{totalPages: 1, _id: 0}";
+    BasicDBObject projectionObj = BasicDBObject.parse(projection);
+    FindIterable<Document> docs = collection.find(findObj).projection(projectionObj);
+    long sum = 0;
+    for (Document doc : docs) {
+        //System.out.println(doc.toJson());
+        // both the numPagesInMRI and totalPages fields are int
+        String fieldName = (fieldToTotal == FIELD_NUM_PAGES_IN_MRI) ?
+        "numPagesInMRI" : "totalPages";
+        sum += doc.getInteger(fieldName);
+    }
+    return sum;
+    }
     /**

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33963

Legend:

other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java

other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

Download in other formats: