Changeset 33963
- Timestamp:
- 2020-02-20T22:12:43+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java
r33961 r33963 46 46 public static final int IS_REALLY_IN_MRI_COLUMN = 2; 47 47 public static final int QUALITY_LEVEL_COLUMN = 3; 48 48 public static final int COUNT_OF_PAGES_IN_MRI_COLUMN = 4; // count as detected by OpenNLP 49 public static final int TOTAL_PAGES_IN_SITE_COLUMN = 5; 49 50 50 51 /** Possible values for the Quality Level column of the csv file */ … … 317 318 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { 318 319 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); 319 320 /*321 qualityLevel = qualityLevel.toUpperCase();322 323 if(qualityLevel.equals("N")) {324 qualityLevel = NAV;325 } else if(qualityLevel.equals("L")) {326 qualityLevel = LITTLE_TEXT;327 } else if(qualityLevel.equals("M")) {328 qualityLevel = MIXED_TEXT;329 } else if(qualityLevel.equals("P")) {330 qualityLevel = MAORI_PARAGRAPHS;331 } else if(qualityLevel.equals("S")) {332 qualityLevel = SIGNIFICANTLY_MAORI;333 } else if(qualityLevel.equals("W")) {334 qualityLevel = WORDS;335 } else if(qualityLevel.equals("O")) {336 qualityLevel = OTHER_LANGUAGE;337 } else if(qualityLevel.equals("E")) {338 qualityLevel = POEMS_OR_SONGS;339 } else if(qualityLevel.equals("I")) {340 qualityLevel = SINGLE_MRI_SENTENCE;341 } else if(qualityLevel.equals("T")) {342 qualityLevel = LINK_TEXT;343 }344 // else remains at whatever was already in the file or345 // else "" if no qualityLevel column for this record present in the file yet346 */347 320 348 321 // Force valid values or "" … … 550 523 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { 551 524 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); 552 525 553 526 // Force valid values or "" 554 527 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); 555 } 556 528 } 529 530 557 531 if(terminate || !qualityLevel.equals(fieldValue) 558 532 /* || basicURL.equals("paekupu.co.nz") // when reviewing MIXED_TEXT */ … … 648 622 649 623 } 650 624 625 /** 626 * Add 2 new columns to the csv file: num pages in site that are inMRI and total num pages in site. 627 */ 628 public void insertTotalsIntoCSVRecords() { 629 630 boolean terminate = false; 631 CSVParser parser = null; 632 633 try { 634 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180); 635 } catch(Exception e) { 636 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e); 637 return; 638 } 639 640 try ( 641 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); 642 ) { 643 644 int recordCount = 0; 645 for (CSVRecord csvRecord : parser) { 646 647 String url = csvRecord.get(URL_COLUMN); 648 if(url.equals("")) { // skip empty lines 649 continue; 650 } 651 652 recordCount++; 653 654 String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false)); 655 656 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); 657 String isReallyInMRI = ""; 658 String qualityLevel = ""; 659 660 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) { 661 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); 662 } 663 664 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { 665 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); 666 } 667 668 //COUNT_OF_PAGES_IN_MRI_COLUMN; TOTAL_PAGES_IN_SITE_COLUMN; 669 long countNumPagesInMRI = mongodbQueryer.getFieldTotalForDomainSuffix( 670 basicURL, MongoDBQueryer.FIELD_NUM_PAGES_IN_MRI); 671 long countTotalPages = mongodbQueryer.getFieldTotalForDomainSuffix( 672 basicURL, MongoDBQueryer.FIELD_TOTAL_PAGES); 673 674 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode 675 + " - " + isReallyInMRI + " - " + qualityLevel 676 + " - " + countNumPagesInMRI + " - " + countTotalPages); 677 678 // Save the CSV record into the tmp file with the 2 counts columns 679 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel, 680 countNumPagesInMRI, countTotalPages); 681 } 682 683 } catch(Exception e) { 684 e.printStackTrace(); 685 logger.error("Exception occurred when processing CSV file or writing out file:\n" 686 + Utility.getFilePath(tmpOutFile)); 687 logger.error(e.getMessage(), e); 688 } 689 690 } 691 692 651 693 public static void printUsage() { 652 694 System.err.println("Usage: ManualURLInspection webPageURLs.txt"); … … 711 753 712 754 713 inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE"); 755 //inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE"); 756 757 inspector.insertTotalsIntoCSVRecords(); 714 758 715 759 //logger.info("Generated temp CSV file: " + filename); -
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java
r33940 r33963 4 4 5 5 import com.mongodb.client.AggregateIterable; 6 import com.mongodb.client.FindIterable; 6 7 import com.mongodb.client.MongoCollection; 7 8 … … 94 95 public static final int CONTAINS_MRI = 1; 95 96 96 /** Some reused fieldnames in the Websites collection */ 97 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI"; 98 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI"; 99 100 97 /** Some field names */ 98 static final int FIELD_NUM_PAGES_IN_MRI = 0; 99 static final int FIELD_TOTAL_PAGES = 1; 100 101 101 102 102 public MongoDBQueryer() throws Exception { … … 251 251 return result; 252 252 } 253 254 /** 255 * @param domainWithoutProtocolOrWWWPrefix 256 * @return numPagesInMRI field's value for the given domain without protocol or www prefix. 257 * Note that multiple sites may match the same partial domain, so they'll have been 258 * totalled before returning the final value. 259 */ 260 public long getFieldTotalForDomainSuffix( 261 String domainWithoutProtocolOrWWWPrefix, int fieldToTotal) 262 { 263 MongoCollection<Document> collection = getWebsitesCollection(); 264 265 // escape dots in domain for regex to get the query string 266 String regexDomain = domainWithoutProtocolOrWWWPrefix.replace(".", "\\."); 267 String query = "{domain: /DOMAIN/}".replace("DOMAIN", regexDomain); 268 BasicDBObject findObj = BasicDBObject.parse(query); 269 270 String projection = (fieldToTotal == FIELD_NUM_PAGES_IN_MRI) ? 271 "{numPagesInMRI: 1, _id: 0}" : "{totalPages: 1, _id: 0}"; 272 BasicDBObject projectionObj = BasicDBObject.parse(projection); 273 274 FindIterable<Document> docs = collection.find(findObj).projection(projectionObj); 275 276 long sum = 0; 277 for (Document doc : docs) { 278 //System.out.println(doc.toJson()); 279 // both the numPagesInMRI and totalPages fields are int 280 String fieldName = (fieldToTotal == FIELD_NUM_PAGES_IN_MRI) ? 281 "numPagesInMRI" : "totalPages"; 282 sum += doc.getInteger(fieldName); 283 } 284 285 return sum; 286 } 287 253 288 254 289 /**
Note:
See TracChangeset
for help on using the changeset viewer.