Changeset 33913


Ignore:
Timestamp:
2020-02-12T21:27:02+13:00 (4 years ago)
Author:
ak19
Message:
  1. Adjusted the MongoDB query statements for the tables to be more exact; the results are unchanged. 2. Adjusted code to no longer treat Australia specially, as the AU site with mi in its URL path has now shifted to US. 3. Documented the differences in geoLocation results between the previous MongoDB ingest and the present one, for cases not dealing with mi in the URL path of overseas domains.
Location:
other-projects/maori-lang-detection
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33905 r33913  
    15981598In gedit replace
    15991599\/\*\s*\d+\s*\*\/ => ,
     1600
     1601----------
     1602
     1603https://www.techdirt.com/articles/20160413/12012834171/how-bad-are-geolocation-tools-really-really-bad.shtml
     1604https://stackoverflow.com/questions/28740077/how-to-find-historical-geolocation-for-an-ip-address-perhaps-using-maxmind
     1605https://serverfault.com/questions/59167/how-often-do-ip-blocks-get-reassigned-to-different-regions
     1606
     1607GEDIT: Regex find and replace at start
     1608       "https?\:\/\/(www.)?
     1609^[^"]*"https?\:\/\/(www.)?
     1610
     1611and at end
     1612    ",
     1613
     1614-----------------------
     1615GEOLOCATION CHANGES AFTER REINGESTING UPON INTRODUCING ANGLICAN.ORG:
     1616-----------------------
     1617NZ the same as before
     1618   NL, DE, FR, DK, ES, GB same
     1619   IT, AT, RO, CH, RU, BG, MX, JP, CN, IE, IR, FI same
     1620
     1621US gained 3:
     1622anglican.org (NEW)
     1623articles.imperialtometric.com (from CA)
     1624daandehn.com (from CA)
     1625
     1626CA lost 2:
     1627articles.imperialtometric.com (to US)
     1628daandehn.com (to US)
     1629
     1630AU:
     1631lost kiwiproperty.com (to US - mi in URL path version file!)
     1632
     1633
     1634CZ:
     1635gained viveipcl.com (from UNKNOWN)
     1636
     1637UNKNOWN:
     1638gained hitiaotera.com (from IL)
     1639
     1640IL:
     1641lost hitiaotera.com (to UNKNOWN)
     1642
  • other-projects/maori-lang-detection/hdfs-cc-work/GS_README.TXT

    r33905 r33913  
    700700db.getCollection('Websites').find({}).count()
    7017011445
     702
     703# Number of distinct domains (ignores protocol and www prefix)
     704db.Websites.distinct('basicDomain').length
     7051220
    702706
    703707# Num webpages
  • other-projects/maori-lang-detection/mongodb-data/tables.txt

    r33894 r33913  
    144144            $and: [
    145145                {geoLocationCountryCode: {$ne: "NZ"}},
    146                 {domain: {$not: /\.nz/}},
    147                 {numPagesContainingMRI: {$gt: 0}},
    148                 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     146                {domain: {$not: /\.nz$/}},
     147                {numPagesContainingMRI: {$gt: 0}},
     148                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
    149149            ]
    150150        }
     
    171171            $and: [
    172172                {numPagesContainingMRI: {$gt: 0}},
    173                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     173                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
    174174            ]
    175175        }
     
    198198            $and: [
    199199                {numPagesInMRI: {$gt: 0}},
    200                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     200                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
    201201            ]
    202202        }
     
    218218
    2192195b. Table 5b:
    220 Table of count of sites with numPagesCoMRI > 0
     220Table of count of sites with numPagesContainingMRI > 0
    221221
    222222Combine the following two:
     
    229229            $and: [
    230230                {geoLocationCountryCode: {$ne: "NZ"}},
    231                 {domain: {$not: /\.nz/}},
     231                {domain: {$not: /\.nz$/}},
    232232                {numPagesContainingMRI: {$gt: 0}}
    233233            ]
     
    255255            $and: [
    256256                {numPagesContainingMRI: {$gt: 0}},
    257                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     257                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
    258258            ]
    259259        }
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33912 r33913  
    359359    // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
    360360    Holder<Integer> docNum = new Holder<>(1);
    361    
     361
     362    /*
    362363    Bson orQuery = or(
    363364              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
     
    365366              // e.g. "{urlContainsLangCodeInPath: false}"
    366367              );
     368    */
    367369    Bson andQuery = and(
    368         BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
    369         BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
    370         BasicDBObject.parse(mriFilterString),
    371         orQuery);
     370        BasicDBObject.parse(mriFilterString),
     371        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
     372        BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),     
     373        BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery);
    372374
    373375    collection.aggregate(Arrays.asList(
     
    652654        orQuery = or(
    653655              BasicDBObject.parse("{geoLocationCountryCode: /(NZ|AU)/}"),
     656             //BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
    654657              BasicDBObject.parse("{domain: /\\.nz$/}"),
    655658              BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
Note: See TracChangeset for help on using the changeset viewer.