Changeset 33889 for other-projects


Ignore:
Timestamp:
2020-02-03T15:48:40+13:00 (4 years ago)
Author:
ak19
Message:
  1. Additional column: totalPagesAcrossMatchingSites. 2. Screengrab of the tables.
Location:
other-projects/maori-lang-detection/mongodb-data
Files:
9 added
8 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/mongodb-data/1a_table_miInUrlPath.csv

    r33848 r33889  
    1 "_id","count","numPagesInMRICount","numPagesContainingMRICount"
    2 "US","408.0","1169","3872"
    3 "CN","123.0","281","1144"
    4 "FR","34.0","754","1091"
    5 "UNKNOWN","19.0","115","125"
    6 "NZ","14.0","618","1097"
    7 "DE","12.0","145","212"
    8 "NL","8.0","76","115"
    9 "CA","7.0","29","119"
    10 "HK","7.0","3","12"
    11 "AU","7.0","19","117"
    12 "GB","5.0","3","7"
    13 "JP","5.0","1","3"
    14 "UA","4.0","9","10"
    15 "RU","4.0","0","14"
    16 "VG","2.0","0","0"
    17 "SG","2.0","2","13"
    18 "DK","2.0","0","0"
    19 "IE","1.0","17","21"
    20 "ZA","1.0","0","0"
    21 "TR","1.0","0","2"
    22 "SE","1.0","0","0"
    23 "EU","1.0","0","7"
    24 "CZ","1.0","0","0"
    25 "ES","1.0","4","4"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites"
     2"US","408.0","1169","3872","199487"
     3"CN","123.0","281","1144","79576"
     4"FR","34.0","754","1091","7983"
     5"UNKNOWN","19.0","115","125","1196"
     6"NZ","14.0","618","1097","5959"
     7"DE","12.0","145","212","5718"
     8"NL","8.0","76","115","8351"
     9"CA","7.0","29","119","11577"
     10"AU","7.0","19","117","549"
     11"HK","7.0","3","12","620"
     12"JP","5.0","1","3","420"
     13"GB","5.0","3","7","1948"
     14"UA","4.0","9","10","1144"
     15"RU","4.0","0","14","30"
     16"DK","2.0","0","0","24"
     17"VG","2.0","0","0","2"
     18"SG","2.0","2","13","1373"
     19"ZA","1.0","0","0","2"
     20"SE","1.0","0","0","6"
     21"TR","1.0","0","2","505"
     22"EU","1.0","0","7","250"
     23"ES","1.0","4","4","3648"
     24"IE","1.0","17","21","451"
     25"CZ","1.0","0","0","1"
     26
  • other-projects/maori-lang-detection/mongodb-data/1b_table_noMiInUrlPath.csv

    r33848 r33889  
    1 "_id","count","numPagesInMRICount","numPagesContainingMRICount"
    2 "US","288.0","2136","5452"
    3 "UNKNOWN","154.0","0","12"
    4 "NZ","101.0","2035","5129"
    5 "DE","40.0","1","208"
    6 "AU","36.0","164","313"
    7 "FR","35.0","22","244"
    8 "NL","24.0","127","265"
    9 "GB","13.0","1","42"
    10 "CA","12.0","1","209"
    11 "DK","8.0","4","8"
    12 "ES","7.0","1","50"
    13 "CZ","6.0","0","32"
    14 "JP","5.0","0","100"
    15 "IT","4.0","0","18"
    16 "RO","3.0","73","116"
    17 "IE","3.0","1","3"
    18 "RU","3.0","0","1"
    19 "AT","3.0","0","61"
    20 "IN","3.0","0","0"
    21 "SE","3.0","0","0"
    22 "IL","3.0","0","19"
    23 "CH","3.0","0","4"
    24 "CN","2.0","0","34"
    25 "PL","2.0","0","0"
    26 "CK","2.0","0","0"
    27 "IO","1.0","0","0"
    28 "SG","1.0","3","10"
    29 "FI","1.0","0","29"
    30 "UA","1.0","0","0"
    31 "IR","1.0","0","1"
    32 "ZA","1.0","0","0"
    33 "MX","1.0","1","21"
    34 "PT","1.0","0","0"
    35 "GR","1.0","1","3"
    36 "PF","1.0","0","0"
    37 "ME","1.0","0","0"
    38 "BG","1.0","2","2"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites"
     2"US","288.0","2136","5452","213005"
     3"UNKNOWN","154.0","0","12","78"
     4"NZ","101.0","2035","5129","31360"
     5"DE","40.0","1","208","30046"
     6"AU","36.0","164","313","10490"
     7"FR","35.0","22","244","29152"
     8"NL","24.0","127","265","10918"
     9"GB","13.0","1","42","2751"
     10"CA","12.0","1","209","11931"
     11"DK","8.0","4","8","16"
     12"ES","7.0","1","50","2671"
     13"CZ","6.0","0","32","9969"
     14"JP","5.0","0","100","1005"
     15"IT","4.0","0","18","684"
     16"SE","3.0","0","0","3"
     17"IN","3.0","0","0","110"
     18"AT","3.0","0","61","984"
     19"RU","3.0","0","1","1994"
     20"IE","3.0","1","3","6"
     21"RO","3.0","73","116","240"
     22"IL","3.0","0","19","641"
     23"CH","3.0","0","4","305"
     24"PL","2.0","0","0","4671"
     25"CN","2.0","0","34","716"
     26"CK","2.0","0","0","2"
     27"PT","1.0","0","0","2852"
     28"MX","1.0","1","21","236"
     29"IR","1.0","0","1","5"
     30"ZA","1.0","0","0","1"
     31"FI","1.0","0","29","1124"
     32"UA","1.0","0","0","294"
     33"SG","1.0","3","10","23"
     34"IO","1.0","0","0","2"
     35"BG","1.0","2","2","2"
     36"ME","1.0","0","0","1"
     37"PF","1.0","0","0","2"
     38"GR","1.0","1","3","4"
     39
  • other-projects/maori-lang-detection/mongodb-data/1table_allCrawledSites.csv

    r33848 r33889  
    1 "_id","count","numPagesInMRICount","numPagesContainingMRICount"
    2 "US","696.0","3305","9324"
    3 "UNKNOWN","173.0","115","137"
    4 "CN","125.0","281","1178"
    5 "NZ","115.0","2653","6226"
    6 "FR","69.0","776","1335"
    7 "DE","52.0","146","420"
    8 "AU","43.0","183","430"
    9 "NL","32.0","203","380"
    10 "CA","19.0","30","328"
    11 "GB","18.0","4","49"
    12 "DK","10.0","4","8"
    13 "JP","10.0","1","103"
    14 "ES","8.0","5","54"
    15 "CZ","7.0","0","32"
    16 "RU","7.0","0","15"
    17 "HK","7.0","3","12"
    18 "UA","5.0","9","10"
    19 "IE","4.0","18","24"
    20 "IT","4.0","0","18"
    21 "SE","4.0","0","0"
    22 "RO","3.0","73","116"
    23 "SG","3.0","5","23"
    24 "AT","3.0","0","61"
    25 "IN","3.0","0","0"
    26 "IL","3.0","0","19"
    27 "CH","3.0","0","4"
    28 "PL","2.0","0","0"
    29 "ZA","2.0","0","0"
    30 "VG","2.0","0","0"
    31 "CK","2.0","0","0"
    32 "IO","1.0","0","0"
    33 "FI","1.0","0","29"
    34 "IR","1.0","0","1"
    35 "TR","1.0","0","2"
    36 "EU","1.0","0","7"
    37 "PT","1.0","0","0"
    38 "MX","1.0","1","21"
    39 "GR","1.0","1","3"
    40 "PF","1.0","0","0"
    41 "ME","1.0","0","0"
    42 "BG","1.0","2","2"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossSites"
     2"US","696.0","3305","9324","412492"
     3"UNKNOWN","173.0","115","137","1274"
     4"CN","125.0","281","1178","80292"
     5"NZ","115.0","2653","6226","37319"
     6"FR","69.0","776","1335","37135"
     7"DE","52.0","146","420","35764"
     8"AU","43.0","183","430","11039"
     9"NL","32.0","203","380","19269"
     10"CA","19.0","30","328","23508"
     11"GB","18.0","4","49","4699"
     12"JP","10.0","1","103","1425"
     13"DK","10.0","4","8","40"
     14"ES","8.0","5","54","6319"
     15"RU","7.0","0","15","2024"
     16"HK","7.0","3","12","620"
     17"CZ","7.0","0","32","9970"
     18"UA","5.0","9","10","1438"
     19"SE","4.0","0","0","9"
     20"IT","4.0","0","18","684"
     21"IE","4.0","18","24","457"
     22"IN","3.0","0","0","110"
     23"SG","3.0","5","23","1396"
     24"AT","3.0","0","61","984"
     25"RO","3.0","73","116","240"
     26"IL","3.0","0","19","641"
     27"CH","3.0","0","4","305"
     28"VG","2.0","0","0","2"
     29"ZA","2.0","0","0","3"
     30"PL","2.0","0","0","4671"
     31"CK","2.0","0","0","2"
     32"PT","1.0","0","0","2852"
     33"IR","1.0","0","1","5"
     34"TR","1.0","0","2","505"
     35"MX","1.0","1","21","236"
     36"FI","1.0","0","29","1124"
     37"IO","1.0","0","0","2"
     38"EU","1.0","0","7","250"
     39"BG","1.0","2","2","2"
     40"ME","1.0","0","0","1"
     41"PF","1.0","0","0","2"
     42"GR","1.0","1","3","4"
     43
  • other-projects/maori-lang-detection/mongodb-data/2table_sitesWithPagesInMRI.csv

    r33886 r33889  
    1 "_id","count","numPagesInMRICount","numPagesContainingMRICount"
    2 "us","206.0","3305","6327"
    3 "nz","53.0","2653","5045"
    4 "cn","32.0","281","542"
    5 "fr","18.0","776","1101"
    6 "au","11.0","183","358"
    7 "nl","10.0","203","216"
    8 "de","5.0","146","190"
    9 "dk","4.0","4","4"
    10 "gb","3.0","4","13"
    11 "ca","3.0","30","35"
    12 "unknown","2.0","115","125"
    13 "ie","2.0","18","24"
    14 "ua","2.0","9","10"
    15 "es","2.0","5","5"
    16 "sg","2.0","5","23"
    17 "jp","1.0","1","3"
    18 "hk","1.0","3","8"
    19 "ro","1.0","73","104"
    20 "bg","1.0","2","2"
    21 "gr","1.0","1","3"
    22 "mx","1.0","1","21"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossSitesWithPositiveMRICount"
     2"us","206.0","3305","6327","174620"
     3"nz","53.0","2653","5045","21901"
     4"cn","32.0","281","542","27405"
     5"fr","18.0","776","1101","17622"
     6"au","11.0","183","358","1329"
     7"nl","10.0","203","216","9185"
     8"de","5.0","146","190","4195"
     9"dk","4.0","4","4","10"
     10"gb","3.0","4","13","2935"
     11"ca","3.0","30","35","2823"
     12"es","2.0","5","5","3649"
     13"ua","2.0","9","10","1140"
     14"sg","2.0","5","23","1357"
     15"ie","2.0","18","24","454"
     16"unknown","2.0","115","125","943"
     17"hk","1.0","3","8","500"
     18"jp","1.0","1","3","377"
     19"gr","1.0","1","3","4"
     20"mx","1.0","1","21","236"
     21"ro","1.0","73","104","105"
     22"bg","1.0","2","2","2"
     23
  • other-projects/maori-lang-detection/mongodb-data/3table_sitesWithPagesContainingMRI.csv

    r33848 r33889  
    1 "_id","count","numPagesInMRICount","numPagesContainingMRICount"
    2 "us","486.0","3305","9324"
    3 "cn","114.0","281","1178"
    4 "nz","89.0","2653","6226"
    5 "fr","36.0","776","1335"
    6 "de","27.0","146","420"
    7 "nl","22.0","203","380"
    8 "au","21.0","183","430"
    9 "ca","12.0","30","328"
    10 "dk","8.0","4","8"
    11 "gb","7.0","4","49"
    12 "es","7.0","5","54"
    13 "cz","4.0","0","32"
    14 "unknown","3.0","115","137"
    15 "ro","3.0","73","116"
    16 "it","3.0","0","18"
    17 "at","3.0","0","61"
    18 "il","2.0","0","19"
    19 "ua","2.0","9","10"
    20 "ru","2.0","0","15"
    21 "ch","2.0","0","4"
    22 "hk","2.0","3","12"
    23 "sg","2.0","5","23"
    24 "ie","2.0","18","24"
    25 "jp","2.0","1","103"
    26 "tr","1.0","0","2"
    27 "mx","1.0","1","21"
    28 "fi","1.0","0","29"
    29 "eu","1.0","0","7"
    30 "gr","1.0","1","3"
    31 "bg","1.0","2","2"
    32 "ir","1.0","0","1"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossSitesWithPositiveContainsMRI"
     2"us","486.0","3305","9324","353593"
     3"cn","114.0","281","1178","78881"
     4"nz","89.0","2653","6226","36874"
     5"fr","36.0","776","1335","33647"
     6"de","27.0","146","420","31884"
     7"nl","22.0","203","380","16016"
     8"au","21.0","183","430","5013"
     9"ca","12.0","30","328","13795"
     10"dk","8.0","4","8","16"
     11"gb","7.0","4","49","3736"
     12"es","7.0","5","54","6318"
     13"cz","4.0","0","32","4698"
     14"it","3.0","0","18","396"
     15"unknown","3.0","115","137","1016"
     16"ro","3.0","73","116","240"
     17"at","3.0","0","61","984"
     18"ua","2.0","9","10","1140"
     19"ru","2.0","0","15","923"
     20"il","2.0","0","19","639"
     21"hk","2.0","3","12","600"
     22"sg","2.0","5","23","1357"
     23"ie","2.0","18","24","454"
     24"jp","2.0","1","103","966"
     25"ch","2.0","0","4","304"
     26"fi","1.0","0","29","1124"
     27"tr","1.0","0","2","505"
     28"mx","1.0","1","21","236"
     29"eu","1.0","0","7","250"
     30"gr","1.0","1","3","4"
     31"bg","1.0","2","2","2"
     32"ir","1.0","0","1","5"
     33
  • other-projects/maori-lang-detection/mongodb-data/4table_tentativeNonProductSites.csv

    r33848 r33889  
    1 "_id","siteCount","numPagesInMRICount","numPagesContainingMRICount"
    2 "us","181.0","2212","5579"
    3 "nz","89.0","2653","6226"
    4 "au","21.0","183","430"
    5 "de","19.0","1","208"
    6 "fr","17.0","22","244"
    7 "nl","16.0","127","265"
    8 "dk","8.0","4","8"
    9 "ca","7.0","1","209"
    10 "es","6.0","1","50"
    11 "gb","5.0","1","42"
    12 "cz","4.0","0","32"
    13 "at","3.0","0","61"
    14 "it","3.0","0","18"
    15 "ro","3.0","73","116"
    16 "ch","2.0","0","4"
    17 "il","2.0","0","19"
    18 "ru","1.0","0","1"
    19 "jp","1.0","0","100"
    20 "unknown","1.0","0","12"
    21 "ie","1.0","1","3"
    22 "fi","1.0","0","29"
    23 "sg","1.0","3","10"
    24 "bg","1.0","2","2"
    25 "cn","1.0","0","34"
    26 "gr","1.0","1","3"
    27 "ir","1.0","0","1"
    28 "mx","1.0","1","21"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites"
     2"us","181.0","2212","5579","158656"
     3"nz","89.0","2653","6226","36874"
     4"au","21.0","183","430","5013"
     5"de","19.0","1","208","26617"
     6"fr","17.0","22","244","25705"
     7"nl","16.0","127","265","7669"
     8"dk","8.0","4","8","16"
     9"ca","7.0","1","209","2228"
     10"es","6.0","1","50","2670"
     11"gb","5.0","1","42","1813"
     12"cz","4.0","0","32","4698"
     13"ro","3.0","73","116","240"
     14"it","3.0","0","18","396"
     15"at","3.0","0","61","984"
     16"il","2.0","0","19","639"
     17"ch","2.0","0","4","304"
     18"sg","1.0","3","10","23"
     19"ir","1.0","0","1","5"
     20"fi","1.0","0","29","1124"
     21"ie","1.0","1","3","3"
     22"ru","1.0","0","1","909"
     23"jp","1.0","0","100","589"
     24"mx","1.0","1","21","236"
     25"unknown","1.0","0","12","73"
     26"gr","1.0","1","3","4"
     27"bg","1.0","2","2","2"
     28"cn","1.0","0","34","270"
     29
  • other-projects/maori-lang-detection/mongodb-data/5table_tentativeNonProductSites1.csv

    r33883 r33889  
    1 "_id","siteCount (numPagesContainingMRICount > 0)","numPagesInMRICount","numPagesContainingMRICount"
    2 "nz","176.0","4360","9641"
    3 "us","117.0","757","2655"
    4 "de","19.0","1","208"
    5 "nl","16.0","127","265"
    6 "fr","16.0","22","243"
    7 "dk","8.0","4","8"
    8 "ca","7.0","1","209"
    9 "au","5.0","8","102"
    10 "cz","4.0","0","32"
    11 "gb","4.0","1","40"
    12 "es","4.0","1","7"
    13 "it","3.0","0","18"
    14 "at","3.0","0","61"
    15 "ro","2.0","0","12"
    16 "il","2.0","0","19"
    17 "ch","2.0","0","4"
    18 "ir","1.0","0","1"
    19 "fi","1.0","0","29"
    20 "ie","1.0","1","3"
    21 "ru","1.0","0","1"
    22 "jp","1.0","0","100"
    23 "mx","1.0","1","21"
    24 "unknown","1.0","0","12"
    25 "bg","1.0","2","2"
    26 "cn","1.0","0","34"
     1"countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites"
     2"nz","176.0","4360","9641","87657"
     3"us","117.0","757","2655","113936"
     4"de","19.0","1","208","26617"
     5"fr","16.0","22","243","25547"
     6"nl","16.0","127","265","7669"
     7"dk","8.0","4","8","16"
     8"ca","7.0","1","209","2228"
     9"au","5.0","8","102","560"
     10"gb","4.0","1","40","1809"
     11"es","4.0","1","7","1354"
     12"cz","4.0","0","32","4698"
     13"it","3.0","0","18","396"
     14"at","3.0","0","61","984"
     15"ro","2.0","0","12","135"
     16"il","2.0","0","19","639"
     17"ch","2.0","0","4","304"
     18"fi","1.0","0","29","1124"
     19"ie","1.0","1","3","3"
     20"jp","1.0","0","100","589"
     21"ru","1.0","0","1","909"
     22"unknown","1.0","0","12","73"
     23"mx","1.0","1","21","236"
     24"bg","1.0","2","2","2"
     25"cn","1.0","0","34","270"
     26"ir","1.0","0","1","5"
     27
  • other-projects/maori-lang-detection/mongodb-data/tables.txt

    r33878 r33889  
    1 1 Instructions for producing the tables:
    2 2 a. Copy the Javascript version of results for each mongodb query listed below into a text editor.
    3   b. Then regex replace \/\*\s*\d+\s*\*\/ with "," and embed all the JS inside [].
      3 b. OPTIONAL: Then regex replace \/\*\s*\d+\s*\*\/ with a comma (','), remove the very first comma, and embed all the JS inside [].
    4 4 c. Paste that Javascript into https://json-csv.com/ to get the CSV tables
    5 5
     
    1717            /*domain: { $addToSet: '$domain' },*/
    1818            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    19             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     19            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     20            totalPagesAcrossSites: { $sum: '$totalPages'}
    2021        }
    2122    },
     
    3536            /*domain: { $addToSet: '$domain' },*/
    3637            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    37             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     38            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     39            totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
    3840        }
    3941    },
     
    5355            /*domain: { $addToSet: '$domain' },*/
    5456            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    55             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     57            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     58            totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
    5659        }
    5760    },
     
    7578            /*domain: { $addToSet: '$domain' },*/
    7679            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    77             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     80            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     81            totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'}
    7882        }
    7983    },
     
    97101            /*domain: { $addToSet: '$domain' },*/
    98102            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    99             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     103            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     104            totalPagesAcrossSitesWithPositiveContainsMRI: { $sum: '$totalPages'}
    100105        }
    101106    },
     
    122127             /*domain: { $addToSet: '$domain' },*/
    123128            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    124             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     129            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     130            totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
    125131        }
    126132    },
     
    151157            /*domain: { $addToSet: '$domain' },*/
    152158            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    153             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     159            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     160            totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
    154161        }
    155162    },
     
    175182            /*domain: { $addToSet: '$domain' },*/
    176183            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    177             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
    178         }
    179     },
    180     { $sort : { count : -1} }
    181 ]);
    182 
    183 
    184 To find NZ web pages in MRI the following may be BETTER,
     184            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     185            totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
     186        }
     187    },
     188    { $sort : { count : -1} }
     189]);
     190
     191
     192 To find NZ web pages IN MRI the following may be BETTER,
    185 193 as it looks for sites with positive numPagesINMRI rather than sites that only have positive containingMRI:
    186 194
     
    201209            domain: { $addToSet: '$domain' },
    202210            numPagesInMRICount: { $sum: '$numPagesInMRI' },
    203             numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
    204         }
    205     },
    206     { $sort : { count : -1} }
    207 ]);
    208 
     211            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
     212            totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
     213        }
     214    },
     215    { $sort : { count : -1} }
     216]);
     217
Note: See TracChangeset for help on using the changeset viewer.