Changeset 33913
- Timestamp: 2020-02-12T21:27:02+13:00 (4 years ago)
- Location: other-projects/maori-lang-detection
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/MoreReading/mongodb.txt
r33905 r33913 1598 1598 In gedit replace 1599 1599 \/\*\s*\d+\s*\*\/ => , 1600 1601 ---------- 1602 1603 https://www.techdirt.com/articles/20160413/12012834171/how-bad-are-geolocation-tools-really-really-bad.shtml 1604 https://stackoverflow.com/questions/28740077/how-to-find-historical-geolocation-for-an-ip-address-perhaps-using-maxmind 1605 https://serverfault.com/questions/59167/how-often-do-ip-blocks-get-reassigned-to-different-regions 1606 1607 GEDIT: Regex find and replace at start 1608 "https?\:\/\/(www.)? 1609 ^[^"]*"https?\:\/\/(www.)? 1610 1611 and at end 1612 ", 1613 1614 ----------------------- 1615 GEOLOCATION CHANGES AFTER REINGESTING UPON INTRODUCING ANGLICAN.ORG: 1616 ----------------------- 1617 NZ the same as before 1618 NL, DE, FR, DK, ES, GB same 1619 IT, AT, RO, CH, RU, BG, MX, JP, CN, IE, IR, FI same 1620 1621 US gained 3: 1622 anglican.org (NEW) 1623 articles.imperialtometric.com (from CA) 1624 daandehn.com (CA) 1625 1626 CA lost 2: 1627 articles.imperialtometric.com (to US) 1628 daandehn.com (to US) 1629 1630 AU: 1631 lost kiwiproperty.com (to US - mi in URL path version file!) 1632 1633 1634 CZ: 1635 gained viveipcl.com (from UNKNOWN) 1636 1637 UNKNOWN: 1638 gained hitiaotera.com from IL 1639 1640 IL: 1641 lost one to (UNKNOWN) 1642 -
other-projects/maori-lang-detection/hdfs-cc-work/GS_README.TXT
r33905 r33913 700 700 db.getCollection('Websites').find({}).count() 701 701 1445 702 703 # Number of distinct domains (ignores protocol and www prefix) 704 db.Websites.distinct('basicDomain').length 705 1220 702 706 703 707 # Num webpages -
other-projects/maori-lang-detection/mongodb-data/tables.txt
r33894 r33913 144 144 $and: [ 145 145 {geoLocationCountryCode: {$ne: "NZ"}}, 146 {domain: {$not: /\.nz /}},147 {numPagesContainingMRI: {$gt: 0}}, 148 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} 146 {domain: {$not: /\.nz$/}}, 147 {numPagesContainingMRI: {$gt: 0}}, 148 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} 149 149 ] 150 150 } … … 171 171 $and: [ 172 172 {numPagesContainingMRI: {$gt: 0}}, 173 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz /}]}173 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]} 174 174 ] 175 175 } … … 198 198 $and: [ 199 199 {numPagesInMRI: {$gt: 0}}, 200 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz /}]}200 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]} 201 201 ] 202 202 } … … 218 218 219 219 5b. Table 5b: 220 Table of count of sites with numPagesCo MRI > 0220 Table of count of sites with numPagesContainingMRI > 0 221 221 222 222 Combine the following two: … … 229 229 $and: [ 230 230 {geoLocationCountryCode: {$ne: "NZ"}}, 231 {domain: {$not: /\.nz /}},231 {domain: {$not: /\.nz$/}}, 232 232 {numPagesContainingMRI: {$gt: 0}} 233 233 ] … … 255 255 $and: [ 256 256 {numPagesContainingMRI: {$gt: 0}}, 257 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz /}]}257 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]} 258 258 ] 259 259 } -
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java
r33912 r33913 359 359 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda 360 360 Holder<Integer> docNum = new Holder<>(1); 361 361 362 /* 362 363 Bson orQuery = or( 363 364 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), … … 365 366 // e.g. "{urlContainsLangCodeInPath: false}" 366 367 ); 368 */ 367 369 Bson andQuery = and( 368 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),369 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),370 BasicDBObject.parse(mriFilterString),371 370 BasicDBObject.parse(mriFilterString), 371 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 372 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"), 373 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery); 372 374 373 375 collection.aggregate(Arrays.asList( … … 652 654 orQuery = or( 653 655 BasicDBObject.parse("{geoLocationCountryCode: /(NZ|AU)/}"), 656 //BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 654 657 BasicDBObject.parse("{domain: /\\.nz$/}"), 655 658 BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
Note: See TracChangeset for help on using the changeset viewer.