Ignore:
Timestamp:
2020-02-05T23:36:37+13:00 (4 years ago)
Author:
ak19
Message:

Code is in an intermediate state. 1. Introduced a basicDomain field to MongoDB and recreated the MongoDB tables/collections. This will help discount duplicated domains that appear under both http and https, with and without www. Though webpage URLs may potentially still be unique and not duplicated across all 4 possible variants, I want them counted under the same base domain name. 2. Another issue noticed now is that some of the sites appear to be hosted on servers in multiple countries, and so slightly different country code counts and domain listings are returned. 3. So added code modifications (untested) to sort the domains alphabetically after stripping the protocol and www. This allows comparing the old domainListing results from MongoDB's oldWebsites and oldWebpages collections (as they have now been renamed) against the new versions of these collections, and then updating the differences in the manual counts.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33887 r33906  
    2626import com.mongodb.Block;
    2727
     28import org.bson.BsonArray;
     29import org.bson.BsonString;
    2830import org.bson.Document;
    2931import org.bson.conversions.Bson;
     
    217219        .append("siteFolderName", website.siteFolderName)
    218220        .append("domain", website.domain)
     221        .append("basicDomain", website.basicDomain)
    219222        .append("totalPages", website.totalPages)
    220223        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
     
    389392     *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
    390393     *
    391      *
     394     * Count by country code of non-NZ websites containing a positive number of sentences in MRI,
     395     * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
     396     * and total counts of numPagesInMRI and numPagesContainingMRI across all these
     397     * matching sites.
     398     *
    392399     * The mongodb aggregate() we want to run this time:
    393400     *
     
    405412          $group: {
    406413            _id: "nz",
    407             count: { $sum: 1 },
    408             domain: { $addToSet: '$domain' }
     414            count: { $sum: 1 },           
     415        domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
    409416          }
    410417    },
     
    431438         match(andQuery),
    432439         unwind("$geoLocationCountryCode"),
    433          group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     440         group("NZ", Arrays.asList(sum("count", 1),
     441                   addToSet("domain", "$basicDomain"))),
    434442         sort(BasicDBObject.parse("{count : -1}"))
    435443     )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
     
    441449   
    442450    /**
     451     * Count of NZ (incl .nz TLD)  websites containing a positive number of sentences in MRI,
     452     * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER
     453     * and total counts of numPagesInMRI and numPagesContainingMRI across all these
     454     * matching sites.
     455     *
    443456     * The aggregate() we want to run this time:
    444457     *
     
    459472            _id: {$toLower: '$geoLocationCountryCode'},
    460473            count: { $sum: 1 },
    461             domain: { $addToSet: '$domain' }
     474        domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
    462475          }
    463476     },
     
    484497        orQuery);
    485498
    486 
    487499    collection.aggregate(Arrays.asList(
    488500         match(andQuery),  //match(BasicDBObject.parse(matchQuery))
    489501         // match((List<DBObject>)JSON.parse(matchQuery)),
    490502         unwind("$geoLocationCountryCode"),
    491          group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     503         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
     504                        addToSet("domain", "$basicDomain"))),
    492505         sort(BasicDBObject.parse("{count : -1}"))
    493506       )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
     
    639652     */   
    640653    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
     654
     655    // If there's a domain field in the json Doc, sort this domain listing alphabetically
     656    Object domainList = doc.remove("domain");
     657    if(domainList != null) {
     658        doc.put("domain", sortAlphabetically(domainList));
     659    }
     660   
    641661    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
    642662    // Can't control json output to add newlines after each array element,
     
    655675    // Have to use gson's pretty print to produce a json string that contains
    656676    // newlines after every array element in the json:
     677   
    657678    String jsonStr = prettyPrintJson(doc.toJson());
    658679    //System.err.println(jsonStr);
     
    663684        throw new UncheckedIOException(ex);
    664685    }       
     686    }
     687
     688    private List sortAlphabetically(Object list) {
     689    BsonArray domainList = (BsonArray)list;
     690    //for(String domain : domainList) {
     691    for(int i = domainList.size() - 1; i >= 0; i--) {
     692        BsonString domain = domainList.get(i).asString();
     693        String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
     694        domainList.set(i, new BsonString(domainStr));       
     695    }
     696
     697    return domainList;
    665698    }
    666699   
Note: See TracChangeset for help on using the changeset viewer.