#!/bin/bash

# Need to run this script from the top-level folder of this extension.
# This script must be provided with the nz-only-TLDs-<date>.txt file produced by running get_commoncrawl_nz_urls.sh.
if [ -z "$1" ]; then
    echo ""
    echo "Usage: $0 nz-only-TLDs-<date>.txt"
    echo "  where the nz-only-TLDs-<date>.txt file is produced by running get_commoncrawl_nz_urls.sh."
    echo ""
    exit 1
fi

uniq_urls_file=uniq-tld-nz-WET-urls-$(date +%F).txt
echo "Creating file $uniq_urls_file containing just the unique WET gz urls for the .nz TLD..."

# cat is unnecessary: https://stackoverflow.com/questions/1915636/is-there-a-way-to-uniq-by-column/1915750
# "You can dump the cat! Rather than piping into <the subsequent process>, just let <the subsequent
# process> read the file using <. Piping through cat is a common unnecessary complication used by
# novices. For large amounts of data there's a performance effect to be had."
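# In other words, prefer "<file command" (or "command file") over "cat file | command".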

#<$1 cut -d ' ' -f18 | sed -r 's@\.?"(,|})\s*$@@' | sed -r 's@^\s*"@@' > $uniq_urls_file
#sort -u $uniq_urls_file > tmp.txt

# The first cut grabs the filename field of the json, containing the relative path to the WET file.
# The sed operations remove the terminating double quote (followed by , or }) and the starting double quote.
# Then we get the unique urls out of all that.

# The above is imperfect because space is not a proper field separator (some json property values
# contain spaces), so cut cannot be relied on to land on the filename field.
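# For example, a record like the following (purely illustrative) has spaces inside the
# title value, shifting which space-separated token field 18 refers to:
#   {"url": "http://example.nz/page", "title": "Kia ora New Zealand", "filename": "crawl-data/...warc.gz"}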
# https://unix.stackexchange.com/questions/155952/should-i-use-piping-or-redirection-for-input-to-sort
tmpfile=tmp_WET_urls.txt

# From each line of json in the input file, we want the value of the filename field, which contains
# the relative path to the gzipped WET file.
# The first sed deletes everything up to and including the filename field name in each line of json.
# The second sed removes the terminating double quote (followed by , or }) and everything after it.
# Then we sort -u to get the unique urls out of all that.
<"$1" sed -r 's@^.*"filename":\s*"@@' | sed -r 's@\.?"(,|}).*$@@' > $tmpfile
sort -u $tmpfile > sorted_WET_urls.txt
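# For example (illustrative record; real lines carry more properties), the two seds reduce
#   {"url": "http://example.nz/", "filename": "crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/warc/CC-MAIN-20190715175205-20190715200159-00000.warc.gz"}
# to just the relative path:
#   crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/warc/CC-MAIN-20190715175205-20190715200159-00000.warc.gz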

rm $tmpfile

# Remove all lines containing paths like /crawldiagnostics/ and /robotstxt/.
# https://stackoverflow.com/questions/17049962/delete-line-containing-one-of-multiple-strings

# And convert links like
# https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/warc/CC-MAIN-20190715175205-20190715200159-00000.warc.gz
# to:
# https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200159-00000.warc.wet.gz

awk '!/crawldiagnostics|robotstxt/' sorted_WET_urls.txt | sed -r 's@/warc/CC-MAIN-@/wet/CC-MAIN-@' | sed -r 's@\.warc\.gz$@.warc.wet.gz@' > $uniq_urls_file

rm sorted_WET_urls.txt

# All urls need to be prefixed with:
#ccprefix=https://commoncrawl.s3.amazonaws.com/
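# A minimal sketch of how a downstream step could apply that prefix (not run here;
# the output file name full-WET-urls.txt is illustrative only):
#   ccprefix=https://commoncrawl.s3.amazonaws.com/
#   sed "s@^@${ccprefix}@" $uniq_urls_file > full-WET-urls.txt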

echo "File $uniq_urls_file containing just the unique commoncrawl WET urls of the .nz TLD has now been created."
# Read via < so that wc -l prints just the count, without echoing the filename after it
num_uniq_urls=$(wc -l < $uniq_urls_file)
total_urls=$(wc -l < "$1")
echo ""
echo ""
echo "Summary:"
echo "There were $num_uniq_urls unique WET urls"
echo "out of a total of $total_urls urls in $1."
echo ""
echo ""

exit 0