#!/bin/bash

#####################################################################################################
# This script follows the instructions at https://groups.google.com/d/msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ
# to download all the .nz TLD URLs that CommonCrawl has crawled for a given date.
# It needs to be provided with either a cc-index.paths.gz file for a particular date or
# the URL for such a file, both obtainable from http://index.commoncrawl.org/
#####################################################################################################

# Print a short usage summary (blank line, usage line, pointer to the index site, blank line).
printUsage() {
    cat <<EOF

Usage: $0 <cc-index.paths.gz | URL to a cc-index.paths.gz file, e.g. https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz>
   You can download such a file or copy its URL from http://index.commoncrawl.org/

EOF
}

# Need to run this script from the top level folder of this extension
# This script needs to be provided with a cc-index.paths.gz downloaded from http://index.commoncrawl.org/
# or a URL of the form https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz

# FIX: quote "$1" — the original's unquoted [ -z $1 ] mis-parses when $1
# contains spaces (word-splitting turns it into a [ syntax error).
if [ -z "$1" ]; then
    printUsage
    exit 1
fi

# now we have a $1
# NOTE: the legacy "x$1" prefix trick is unnecessary inside [[ ]], which never word-splits.
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    printUsage
    exit 0
fi

# We'll create a temporary directory to work in and tell the user to delete it after we're done
mkdir -p tmp

# some sanity checking
infile=$1
# if $1 contains the correct format for the URL to the cc-index.paths.gz, then wget it
# and make that local copy the input file
if [[ $infile == https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-*/cc-index.paths.gz ]]; then
    # FIX: quote the URL so an unusual $1 cannot be word-split or glob-expanded (SC2086)
    wget "$infile"

    # wget saves to the URL's basename in the current directory
    if [ ! -f cc-index.paths.gz ]; then
        echo "Downloading of $infile failed."
        # FIX: 'exit -1' is not a valid exit status (bash maps it to 255); use 1
        exit 1
    fi

    mv cc-index.paths.gz tmp/
    infile=tmp/cc-index.paths.gz
fi

# Reject anything that is not a cc-index.paths.gz file (or a URL we already fetched above)
if [[ $infile != *cc-index.paths.gz ]]; then
    echo "*** Invalid parameter: This script can only work with the cc-index.paths.gz file or URL to such a file"
    echo ""
    printUsage
    exit 1
fi

# decompress. gunzip removes the .gz archive file and just leaves the decompressed version (same filename, without .gz)
# So infile hereafter is without the .gz suffix, https://wiki.bash-hackers.org/syntax/pe#substring_removal
# FIX: quote "$infile" and stop on decompression failure instead of carrying on with a stale path
if ! gunzip "$infile"; then
    echo "Could not decompress $infile"
    exit 1
fi
infile=${infile%.gz}
#echo "Finished extracting $infile from the download"

aws_prefix=https://commoncrawl.s3.amazonaws.com/
cluster_url=
# Read the paths file line-by-line in reverse (cat-reversed tac instead of cat) and
# store the cluster.idx line. Process substitution (not a pipe) keeps the loop in the
# current shell, so 'break' works and $cluster_url survives the loop:
# https://unix.stackexchange.com/questions/166546/bash-cannot-break-out-of-piped-while-read-loop-process-substitution-works
# FIX: 'IFS= read -r' preserves leading whitespace and backslashes in each line.
while IFS= read -r line; do
    if [[ $line = *cluster.idx ]]; then
        #echo "Found cluster.idx mentioned at: $line"
        cluster_url=$aws_prefix$line
        echo "Will download the cluster.idx file from $cluster_url"
        echo ""
        break
    fi
done < <(tac "$infile")

# make sure we have a cluster.idx URL
# BUG FIX: the original tested $1 here, which is always non-empty at this point,
# so a paths file without any cluster.idx entry was never detected. Test the
# variable the loop actually sets.
if [ -z "$cluster_url" ]; then
    echo "Could not find the cluster.idx listed in $infile"
    echo ""
    exit 1
fi

# Work in the tmp folder for a while
pushd tmp

# Now let's get cluster.idx
# FIX: quote the URL (SC2086)
wget "$cluster_url"

if [ ! -f cluster.idx ]; then
    echo "Downloading of $cluster_url failed."
    echo ""
    exit 1
fi

# Hereafter, the amazon prefix is the parent path of the full URL to cluster.idx
aws_prefix=${cluster_url%cluster.idx}
#echo "New prefix: $aws_prefix"

# FIX: $(...) instead of deprecated backticks; quote the filename expansions
outfile=nz-only-TLDs-$(date +%F).txt
touch "$outfile"

# cluster.idx lines are TAB-separated; field 2 names the cdx-*.gz shard covering
# those SURT domains. Loop over the (deduplicated) shards that mention the nz TLD.
while IFS= read -r urls_file; do
    echo "Found .nz TLD mentioned in: $urls_file (located in $aws_prefix). Will wget and extract."
    echo ""

    # get the large file (under 1GB)
    wget "$aws_prefix$urls_file"
    # Unzip each, and we have all URLs with TLD .nz, but perhaps also .no for Norway in the first and .org in the last
    # We just want the nz domains.

    if [ ! -f "$urls_file" ]; then
        echo "ERROR: Downloading of $urls_file containing .nz specific URLs failed."
        echo ""
        # BUG FIX: the original used 'break' here, contradicting its own comment;
        # 'continue' goes around the loop trying the remaining .gz shards instead
        # of aborting the whole download.
        continue
    else
        # Extract the huge file
        echo "Extracting..."
        echo "   (This may take a minute or so, please be patient.)"
        gunzip "$urls_file"
        echo ""

        echo "Reducing the downloaded URLs to just those with toplevel domain .nz..."
        echo ""

        # Extract just those URLs that start with "^nz," (since the SURT form of URLs,
        # the form starting with the TLD, comes first on each line) and append to
        # $outfile of all .nz TLDs for the specific crawl.
        # FIX: grep -E replaces the deprecated 'egrep'; all expansions quoted.
        urls_file=${urls_file%.gz}
        grep -E "^nz," "$urls_file" >> "$outfile"
    fi
done < <(grep '^nz,' cluster.idx | cut -f2 | uniq)

# Move the collected .nz URLs up out of the work directory.
# FIX: quote "$outfile" (SC2086)
mv "$outfile" ../
# get out of the tmp dir
popd

echo ""
echo "The file $outfile has now been created, containing all the .nz domains for the crawl of $1"
echo "Remember to delete the products in the tmp folder or the folder itself after inspecting its contents"
echo ""
exit 0
---|