Changeset 33804


Ignore:
Timestamp:
2019-12-13T20:00:53+13:00 (4 years ago)
Author:
ak19
Message:
  1. Updated results from mongodb querying after yesterday's modifications to NutchTextDumpToMongoDB.java. Mods included removing another adult site (moved to blacklist) and replacing the country codes of sites whose origin countries couldn't be detected with their TLD suffix, else UNKNOWN, instead of the previous null. Added an extra field to the Websites mongodb collection called numPagesContainingMRI, which counts the number of webpages in a website that have at least one sentence for which OpenNLP predicted the best detectable language as MRI. 2. Added new Mongodb queries to work with the numPagesContainingMRI field.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33800 r33804  
    454454# Num websites
    455455db.getCollection('Websites').find({}).count()
    456 1446
     4561445
    457457
    458458# Num webpages
     
    464464db.getCollection('Websites').find({numPagesInMRI: { $gt: 0}}).count()
    465465361
     466
     467# Number of sites containing at least one sentence for which OpenNLP detected the best language = MRI
     468db.getCollection('Websites').find({numPagesContainingMRI: {$gt: 0}}).count()
     469868
     470
     471# Obviously, the count for the union of the above two queries is identical to the count of sites with numPagesContainingMRI > 0:
     472db.getCollection('Websites').find({ $or: [ { numPagesInMRI: { $gt: 0 } }, { numPagesContainingMRI: {$gt: 0} } ] } ).count()
     473868
    466474
    467475# Find number of webpages that are deemed to be overall in MRI (pages where isMRI=true)
     
    484492# Number of websites outside NZ that contain /mi(/) in any of their sub-URLs
    485493db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
    486 148
     494147
    487495
    488496# Number of sites with URLs containing /mi(/) that are in NZ
    489497db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: "NZ"}).count()
    490 5
     4986
     499
    491500
    492501# sort websites that contain /mi(/) in path by geoLocationCountryCode
     
    557566        $group: {
    558567            _id: "$geoLocationCountryCode",
     568            count: { $sum: 1 }
     569        }
     570    },
     571    { $sort : { count : -1} }
     572]);
     573
     574// count of country codes for sites that have at least one page detected as MRI
     575
     576db.Websites.aggregate([
     577    {
     578        $match: {
     579            numPagesInMRI: {$gt: 0}
     580        }
     581    },
     582    { $unwind: "$geoLocationCountryCode" },
     583    {
     584        $group: {
     585            _id: {$toLower: '$geoLocationCountryCode'},
     586            count: { $sum: 1 }
     587        }
     588    },
     589    { $sort : { count : -1} }
     590]);
     591
     592// count of country codes for sites that have at least one page containing at least one sentence detected as MRI
     593db.Websites.aggregate([
     594    {
     595        $match: {
     596            numPagesContainingMRI: {$gt: 0}
     597        }
     598    },
     599    { $unwind: "$geoLocationCountryCode" },
     600    {
     601        $group: {
     602            _id: {$toLower: '$geoLocationCountryCode'},
    559603            count: { $sum: 1 }
    560604        }
Note: See TracChangeset for help on using the changeset viewer.