Changeset 33913

Show
Ignore:
Timestamp:
12.02.2020 21:27:02 (6 days ago)
Author:
ak19
Message:

1. Adjusted the MongoDB query statements for the tables to be more exact; the results are unchanged. 2. Adjusted the code to no longer treat Australia specially, as the AU site with mi in its URL path has now shifted to US. 3. Documented the differences in geoLocation results between the previous MongoDB ingest and the present one, for cases not dealing with mi in the URL path of overseas domains. 4.

Location:
other-projects/maori-lang-detection
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33905 r33913  
    15981598In gedit replace 
    15991599\/\*\s*\d+\s*\*\/ => , 
     1600 
     1601---------- 
     1602 
     1603https://www.techdirt.com/articles/20160413/12012834171/how-bad-are-geolocation-tools-really-really-bad.shtml 
     1604https://stackoverflow.com/questions/28740077/how-to-find-historical-geolocation-for-an-ip-address-perhaps-using-maxmind 
     1605https://serverfault.com/questions/59167/how-often-do-ip-blocks-get-reassigned-to-different-regions 
     1606 
     1607GEDIT: Regex find and replace at start 
     1608       "https?\:\/\/(www.)? 
     1609^[^"]*"https?\:\/\/(www.)? 
     1610 
     1611and at end 
     1612    ", 
     1613 
     1614----------------------- 
     1615GEOLOCATION CHANGES AFTER REINGESTING UPON INTRODUCING ANGLICAN.ORG: 
     1616----------------------- 
     1617NZ the same as before 
     1618   NL, DE, FR, DK, ES, GB same 
     1619   IT, AT, RO, CH, RU, BG, MX, JP, CN, IE, IR, FI same 
     1620 
     1621US gained 3: 
     1622anglican.org (NEW) 
     1623articles.imperialtometric.com (from CA) 
     1624daandehn.com (from CA) 
     1625 
     1626CA lost 2: 
     1627articles.imperialtometric.com (to US) 
     1628daandehn.com (to US) 
     1629 
     1630AU: 
     1631lost kiwiproperty.com (to US - mi in URL path version file!) 
     1632 
     1633 
     1634CZ: 
     1635gained viveipcl.com (from UNKNOWN) 
     1636 
     1637UNKNOWN: 
     1638gained hitiaotera.com from IL 
     1639 
     1640IL: 
     1641lost hitiaotera.com (to UNKNOWN) 
     1642 
  • other-projects/maori-lang-detection/hdfs-cc-work/GS_README.TXT

    r33905 r33913  
    700700db.getCollection('Websites').find({}).count() 
    7017011445  
     702 
     703# Number of distinct domains (ignores protocol and www prefix)  
     704db.Websites.distinct('basicDomain').length 
     7051220 
    702706 
    703707# Num webpages 
  • other-projects/maori-lang-detection/mongodb-data/tables.txt

    r33894 r33913  
    144144            $and: [ 
    145145                {geoLocationCountryCode: {$ne: "NZ"}}, 
    146                 {domain: {$not: /\.nz/}}, 
    147                 {numPagesContainingMRI: {$gt: 0}}, 
    148                 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}             
     146                {domain: {$not: /\.nz$/}}, 
     147                {numPagesContainingMRI: {$gt: 0}}, 
     148                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} 
    149149            ] 
    150150        } 
     
    171171            $and: [ 
    172172                {numPagesContainingMRI: {$gt: 0}}, 
    173                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
     173                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]} 
    174174            ] 
    175175        } 
     
    198198            $and: [ 
    199199                {numPagesInMRI: {$gt: 0}}, 
    200                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
     200                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]} 
    201201            ] 
    202202        } 
     
    218218 
    2192195b. Table 5b: 
    220 Table of count of sites with numPagesCoMRI > 0 
     220Table of count of sites with numPagesContainingMRI > 0 
    221221 
    222222Combine the following two: 
     
    229229            $and: [ 
    230230                {geoLocationCountryCode: {$ne: "NZ"}}, 
    231                 {domain: {$not: /\.nz/}}, 
     231                {domain: {$not: /\.nz$/}}, 
    232232                {numPagesContainingMRI: {$gt: 0}} 
    233233            ] 
     
    255255            $and: [ 
    256256                {numPagesContainingMRI: {$gt: 0}}, 
    257                 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
     257                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]} 
    258258            ] 
    259259        } 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33912 r33913  
    359359    // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda 
    360360    Holder<Integer> docNum = new Holder<>(1); 
    361      
     361 
     362    /* 
    362363    Bson orQuery = or( 
    363364              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), 
     
    365366              // e.g. "{urlContainsLangCodeInPath: false}" 
    366367              ); 
     368    */ 
    367369    Bson andQuery = and( 
    368         BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
    369         BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"), 
    370         BasicDBObject.parse(mriFilterString), 
    371         orQuery); 
     370        BasicDBObject.parse(mriFilterString), 
     371        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
     372        BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),       
     373        BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery); 
    372374 
    373375    collection.aggregate(Arrays.asList( 
     
    652654        orQuery = or( 
    653655              BasicDBObject.parse("{geoLocationCountryCode: /(NZ|AU)/}"), 
     656             //BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 
    654657              BasicDBObject.parse("{domain: /\\.nz$/}"), 
    655658              BasicDBObject.parse("{urlContainsLangCodeInPath: false}")