Changeset 33413 for gs3-extensions
- Timestamp:
- 2019-08-13T21:57:42+12:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/bin/script
- Files:
-
- 2 added
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/bin/script/get_commoncrawl_nz_urls.sh
r33394 → r33413 (script lines 158–181 commented out; surrounding context unchanged):

@@ -156,28 +156,28 @@
 echo ""
 
-uniq_urls_file=uniq-tld-nz-urls-`date +%F`.txt
-echo "Creating file $uniq_urls_file containing just the unique domains and subdomains..."
-# cat $outfile | cut -d ' ' -f4 | cut -d/ -f3 | sed -r 's@\.?",\s*$@@' | sed -r 's@^www\.@@' | uniq > $uniq_urls_file
-<$outfile cut -d ' ' -f4 | cut -d/ -f3 | sed -r 's@\.?",\s*$@@' | sed -r 's@^www\.@@' | uniq > $uniq_urls_file
+# uniq_urls_file=uniq-tld-nz-urls-`date +%F`.txt
+# echo "Creating file $uniq_urls_file containing just the unique domains and subdomains..."
+# #cat $outfile | cut -d ' ' -f4 | cut -d/ -f3 | sed -r 's@\.?",\s*$@@' | sed -r 's@^www\.@@' | uniq > $uniq_urls_file
+# <$outfile cut -d ' ' -f4 | cut -d/ -f3 | sed -r 's@\.?",\s*$@@' | sed -r 's@^www\.@@' | uniq > $uniq_urls_file
 
-# cat is unnecessary: https://stackoverflow.com/questions/1915636/is-there-a-way-to-uniq-by-column/1915750
-# "You can dump the cat! Rather than piping into <the subsequent process>, just let <the subsequent process> read the file using <. Piping through cat is a common unnecessary complication used by novices. For large amounts of data there's a performance effect to be had."
+# # cat is unnecessary: https://stackoverflow.com/questions/1915636/is-there-a-way-to-uniq-by-column/1915750
+# # "You can dump the cat! Rather than piping into <the subsequent process>, just let <the subsequent process> read the file using <. Piping through cat is a common unnecessary complication used by novices. For large amounts of data there's a performance effect to be had."
 
-# The first cut grabs the url field of the json.
-# The second cut grabs the domain name from the url (located between first // and immediately subsequent /).
-# The first sed process then removes any trailing . (e.g. "massey.ac.nz." becomes "massey.ac.nz") followed by ", and optional spaces before the end,
-# and the final sed removes any "www." prefix.
-# Then we get the uniq urls out of all that.
+# # The first cut grabs the url field of the json.
+# # The second cut grabs the domain name from the url (located between first // and immediately subsequent /).
+# # The first sed process then removes any trailing . (e.g. "massey.ac.nz." becomes "massey.ac.nz") followed by ", and optional spaces before the end,
+# # and the final sed removes any "www." prefix.
+# # Then we get the uniq urls out of all that.
 
-echo "File $uniq_urls_file containing just the unique .nz sites (domains and subdomains) to be used as seed urls has now been created."
-num_uniq_urls=`wc -l $uniq_urls_file`
-total_urls=`wc -l $outfile`
-echo ""
-echo ""
-echo "Summary:"
-echo "There were $num_uniq_urls unique sites"
-echo "out of a total of $total_urls urls in $outfile."
-echo ""
-echo ""
+# echo "File $uniq_urls_file containing just the unique .nz sites (domains and subdomains) to be used as seed urls has now been created."
+# num_uniq_urls=`wc -l $uniq_urls_file`
+# total_urls=`wc -l $outfile`
+# echo ""
+# echo ""
+# echo "Summary:"
+# echo "There were $num_uniq_urls unique sites"
+# echo "out of a total of $total_urls urls in $outfile."
+# echo ""
+# echo ""
 
 echo ""
Note:
See TracChangeset
for help on using the changeset viewer.