Changeset 33963


Ignore:
Timestamp:
2020-02-20T22:12:43+13:00 (4 years ago)
Author:
ak19
Message:

Added a new helper method to MongoDBQueryer.java to add numPagesInMRI and totalPages totals columns into the csv file

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java

    r33961 r33963  
    4646    public static final int IS_REALLY_IN_MRI_COLUMN = 2;
    4747    public static final int QUALITY_LEVEL_COLUMN = 3;
    48 
     48    public static final int COUNT_OF_PAGES_IN_MRI_COLUMN = 4; // count as detected by OpenNLP
     49    public static final int TOTAL_PAGES_IN_SITE_COLUMN = 5;
    4950
    5051    /** Possible values for the Quality Level column of the csv file */
     
    317318        if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
    318319            qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
    319 
    320             /*
    321             qualityLevel = qualityLevel.toUpperCase();
    322            
    323             if(qualityLevel.equals("N")) {
    324             qualityLevel = NAV;
    325             } else if(qualityLevel.equals("L")) {
    326             qualityLevel = LITTLE_TEXT;
    327             } else if(qualityLevel.equals("M")) {
    328             qualityLevel = MIXED_TEXT;
    329             } else if(qualityLevel.equals("P")) {
    330             qualityLevel = MAORI_PARAGRAPHS;
    331             } else if(qualityLevel.equals("S")) {
    332             qualityLevel = SIGNIFICANTLY_MAORI;
    333             } else if(qualityLevel.equals("W")) {
    334             qualityLevel = WORDS;
    335             } else if(qualityLevel.equals("O")) {
    336             qualityLevel = OTHER_LANGUAGE;
    337             } else if(qualityLevel.equals("E")) {
    338             qualityLevel = POEMS_OR_SONGS;
    339             } else if(qualityLevel.equals("I")) {
    340             qualityLevel = SINGLE_MRI_SENTENCE;
    341             } else if(qualityLevel.equals("T")) {
    342             qualityLevel = LINK_TEXT;
    343             }
    344             // else remains at whatever was already in the file or
    345             // else "" if no qualityLevel column for this record present in the file yet
    346             */
    347320
    348321            // Force valid values or ""
     
    550523        if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
    551524            qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
    552 
     525           
    553526            // Force valid values or ""
    554527            qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
    555         }       
    556        
     528        }
     529       
     530           
    557531        if(terminate || !qualityLevel.equals(fieldValue)
    558532           /* || basicURL.equals("paekupu.co.nz") // when reviewing MIXED_TEXT */
     
    648622
    649623    }
    650    
     624
     625    /**
     626     * Add 2 new columns to the csv file: num pages in site that are inMRI and total num pages in site.
     627     */   
     628     public void insertTotalsIntoCSVRecords() {
     629   
     630    boolean terminate = false;
     631    CSVParser parser = null;
     632   
     633    try {
     634        parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
     635    } catch(Exception e) {
     636        logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
     637        return;
     638    }
     639   
     640    try (
     641         CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
     642         ) {
     643
     644        int recordCount = 0;
     645        for (CSVRecord csvRecord : parser) {       
     646       
     647        String url = csvRecord.get(URL_COLUMN);
     648        if(url.equals("")) { // skip empty lines
     649            continue;
     650        }
     651       
     652        recordCount++;
     653       
     654        String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
     655
     656        String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
     657        String isReallyInMRI = "";
     658        String qualityLevel = "";
     659       
     660        if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
     661            isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
     662        }
     663       
     664        if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
     665            qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
     666        }       
     667       
     668        //COUNT_OF_PAGES_IN_MRI_COLUMN; TOTAL_PAGES_IN_SITE_COLUMN;
     669        long countNumPagesInMRI = mongodbQueryer.getFieldTotalForDomainSuffix(
     670                      basicURL, MongoDBQueryer.FIELD_NUM_PAGES_IN_MRI);
     671        long countTotalPages = mongodbQueryer.getFieldTotalForDomainSuffix(
     672                     basicURL, MongoDBQueryer.FIELD_TOTAL_PAGES);
     673
     674        logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
     675                + " - " + isReallyInMRI + " - " + qualityLevel
     676                + " - " + countNumPagesInMRI + " - " + countTotalPages);
     677
     678        // Save the CSV record into the tmp file with the 2 counts columns
     679        csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel,
     680                      countNumPagesInMRI, countTotalPages);     
     681        }
     682       
     683    } catch(Exception e) {
     684        e.printStackTrace();
     685        logger.error("Exception occurred when processing CSV file or writing out file:\n"
     686             + Utility.getFilePath(tmpOutFile));
     687        logger.error(e.getMessage(), e);
     688    }
     689
     690    }
     691
     692   
    651693    public static void printUsage() {
    652694    System.err.println("Usage: ManualURLInspection webPageURLs.txt");
     
    711753
    712754       
    713         inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE");
     755        //inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE");
     756
     757        inspector.insertTotalsIntoCSVRecords();
    714758       
    715759        //logger.info("Generated temp CSV file: " + filename);
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33940 r33963  
    44
    55import com.mongodb.client.AggregateIterable;
     6import com.mongodb.client.FindIterable;
    67import com.mongodb.client.MongoCollection;
    78
     
    9495    public static final int CONTAINS_MRI = 1;
    9596
    96     /** Some reused fieldnames in the Websites collection */
    97     private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
    98     private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
    99 
    100 
      97    /** Selectors telling getFieldTotalForDomainSuffix() which field of a Websites document to total */
     98    static final int FIELD_NUM_PAGES_IN_MRI = 0;
     99    static final int FIELD_TOTAL_PAGES = 1;
     100   
    101101   
    102102    public MongoDBQueryer() throws Exception {
     
    251251    return result;
    252252    }
     253
     254    /**
     255     * @param domainWithoutProtocolOrWWWPrefix
     256     * @return numPagesInMRI field's value for the given domain without protocol or www prefix.
     257     * Note that multiple sites may match the same partial domain, so they'll have been
     258     * totalled before returning the final value.
     259     */
     260    public long getFieldTotalForDomainSuffix(
     261             String domainWithoutProtocolOrWWWPrefix, int fieldToTotal)
     262    {   
     263    MongoCollection<Document> collection = getWebsitesCollection();
     264
     265    // escape dots in domain for regex to get the query string
     266    String regexDomain = domainWithoutProtocolOrWWWPrefix.replace(".", "\\.");
     267    String query = "{domain: /DOMAIN/}".replace("DOMAIN", regexDomain);
     268    BasicDBObject findObj = BasicDBObject.parse(query);
     269
     270    String projection = (fieldToTotal == FIELD_NUM_PAGES_IN_MRI) ?
     271        "{numPagesInMRI: 1, _id: 0}" : "{totalPages: 1, _id: 0}";   
     272    BasicDBObject projectionObj = BasicDBObject.parse(projection);
     273
     274    FindIterable<Document> docs = collection.find(findObj).projection(projectionObj);
     275
     276    long sum = 0;
     277    for (Document doc : docs) {     
     278        //System.out.println(doc.toJson());
     279        // both the numPagesInMRI and totalPages fields are int
     280        String fieldName = (fieldToTotal == FIELD_NUM_PAGES_IN_MRI) ?
     281        "numPagesInMRI" : "totalPages";
     282        sum += doc.getInteger(fieldName);
     283    }   
     284   
     285    return sum;
     286    }
     287   
    253288   
    254289    /**     
Note: See TracChangeset for help on using the changeset viewer.