source: gs3-extensions/maori-lang-detection/bin/script/get_commoncrawl_nz_urls.sh@33393

Last change on this file since 33393 was 33393, checked in by ak19, 5 years ago

Modified the get_commoncrawl_nz_urls.sh to also create a reduced urls file of just the unique toplevel sites

  • Property svn:executable set to *
File size: 6.5 KB
#!/bin/bash

#####################################################################################################
# This script follows the instructions at https://groups.google.com/d/msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ
# to download all the .nz TLD URLs that CommonCrawl has crawled for a given date.
# It needs to be provided with either a cc-index.paths.gz file for a particular date or
# the URL for such a file, both obtainable from http://index.commoncrawl.org/
#####################################################################################################


function printUsage() {
    echo ""
    echo "Usage: $0 <cc-index.paths.gz | URL to a cc-index.paths.gz file, e.g. https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz>"
    echo "       You can download such a file or copy its URL from http://index.commoncrawl.org/"
    echo ""
}


# Need to run this script from the top level folder of this extension.
# This script needs to be provided with a cc-index.paths.gz downloaded from http://index.commoncrawl.org/
# or a URL of the form https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz
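# Example invocation (illustrative only; substitute the crawl date you are interested in), run from
# the extension's top level folder:
#     bin/script/get_commoncrawl_nz_urls.sh https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz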
if [ -z "$1" ]; then
    printUsage
    exit 1
fi

# now we have a $1
if [[ "x$1" = "x--help" || "x$1" = "x-h" ]]; then
    printUsage
    exit 0
fi

# We'll create a temporary directory to work in and tell the user to delete it after we're done
mkdir -p tmp

# some sanity checking
infile=$1
# If $1 has the correct format for the URL to a cc-index.paths.gz file, then wget it and make the
# downloaded copy the infile that the rest of the script works on.
if [[ $infile == https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-*/cc-index.paths.gz ]]; then
    wget "$infile"

    if [ ! -f cc-index.paths.gz ]; then
        echo "Downloading of $infile failed."
        exit -1
    fi

    mv cc-index.paths.gz tmp/.
    infile=tmp/cc-index.paths.gz
fi

if [[ $infile != *cc-index.paths.gz ]]; then
    echo "*** Invalid parameter: This script can only work with the cc-index.paths.gz file or URL to such a file"
    echo ""
    printUsage
    exit -1
fi

# Decompress. gunzip removes the .gz archive file and just leaves the decompressed version (same filename, without .gz).
# So infile hereafter is without the .gz suffix, https://wiki.bash-hackers.org/syntax/pe#substring_removal
gunzip $infile
infile=${infile%.gz}
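# For illustration (using the path this script itself sets when given a URL): if infile was
# "tmp/cc-index.paths.gz", the ${infile%.gz} expansion above leaves infile as "tmp/cc-index.paths",
# which matches the filename gunzip leaves behind.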
#echo "Finished extracting $infile from the download"

aws_prefix=https://commoncrawl.s3.amazonaws.com/
cluster_url=
# Read the file in line-by-line in reverse (tac, the reversed cat, instead of cat) and store the cluster.idx line.
# http://forums.codeguru.com/showthread.php?472039-RESOLVED-BASH-read-from-end-of-file-upwards

#tac $infile | while read line; do
#    if [[ $line = *cluster.idx ]]; then
#        echo "Found cluster.idx at: $line"
#        cluster_url=$aws_prefix$line
#        echo "Will download cluster.idx file from $cluster_url"
#    fi
#done

# The piped version above is no good: the while loop runs in a subshell on the receiving end of the pipe,
# so we cannot BREAK out of it once we have found the line we want, and any assignment to cluster_url would
# be lost when that subshell exits. Instead, feed tac's output to the loop via process substitution, read
# until we find the line of interest, and then break out of the loop properly/cleanly:
# https://unix.stackexchange.com/questions/166546/bash-cannot-break-out-of-piped-while-read-loop-process-substitution-works
while read line; do
    if [[ $line = *cluster.idx ]]; then
        #echo "Found cluster.idx mentioned at: $line"
        cluster_url=$aws_prefix$line
        echo "Will download the cluster.idx file from $cluster_url"
        echo ""
        break
    fi
done < <(tac $infile)

# make sure we have a cluster.idx URL
if [ -z "$cluster_url" ]; then
    echo "Could not find the cluster.idx listed in $infile"
    echo ""
    exit -1
fi

# Work in the tmp folder for a while
pushd tmp

# Now let's get cluster.idx
wget $cluster_url

if [ ! -f cluster.idx ]; then
    echo "Downloading of $cluster_url failed."
    echo ""
    exit -1
fi

# Hereafter, the amazon prefix is the parent path of the full URL to cluster.idx
aws_prefix=${cluster_url%cluster.idx}
#echo "New prefix: $aws_prefix"
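# For illustration (the exact layout can differ between crawls): if cc-index.paths.gz listed
# cc-index/collections/CC-MAIN-2019-30/indexes/cluster.idx, then cluster_url is
# https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-30/indexes/cluster.idx
# and the ${cluster_url%cluster.idx} expansion strips the trailing "cluster.idx", leaving aws_prefix as
# https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-30/indexes/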

outfile=nz-only-TLDs-`date +%F`.txt
touch $outfile

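# The loop below is driven by cluster.idx. As far as this script relies on it (the column layout described
# here is an assumption about the index format, not something the script verifies), it is a tab-separated
# index whose first field starts with the SURT-form key of a block of indexed URLs and whose second field
# names the cdx-*.gz shard holding that block. In SURT form the host is written back-to-front, so a host
# like www.massey.ac.nz appears roughly as "nz,ac,massey)/..." and every .nz entry starts with "nz,".
# Hence grep '^nz,' + cut -f2 + uniq (at the bottom of the loop) yields the cdx-*.gz shards to fetch.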
while read line; do
    urls_file=$line
    echo "Found .nz TLD mentioned in: $urls_file (located in $aws_prefix). Will wget and extract."
    echo ""

    # get the large file (under 1GB)
    wget "$aws_prefix$urls_file"
    # Unzip each, and we have all the URLs with TLD .nz; but since each shard spans a sorted range of SURT
    # keys, the first one may also contain some .no (Norway) URLs and the last one some .org URLs.
    # We just want the nz domains.

    if [ ! -f "$urls_file" ]; then
        echo "ERROR: Downloading of $urls_file containing .nz specific URLs failed."
        echo ""
        # Let's not exit, but go around the loop trying to download any other gz files also mentioning the .nz TLD
        continue
    else
        # Extract the huge file
        echo "Extracting..."
        echo "   (This may take a minute or so, please be patient.)"
        gunzip $urls_file
        echo ""

        echo "Reducing the downloaded URLs to just those with toplevel domain .nz..."
        echo ""

        # Extract just those lines that start with "nz," (since the SURT form of each URL, which begins with the TLD, comes first on each line)
        # and append them to $outfile, the collection of all .nz TLD entries for this specific crawl.
        # See https://www.unix.com/shell-programming-and-scripting/176608-how-copy-lines-starts-either-3-4-into-new-file.html
        urls_file=${urls_file%.gz}
        egrep "^nz," $urls_file >> $outfile
    fi
done < <(grep '^nz,' cluster.idx | cut -f2 | uniq)

mv $outfile ../.
# get out of the tmp dir
popd

echo ""
echo "The file $outfile has now been created, containing all the .nz URLs for the crawl of $1"
echo ""

uniq_urls_file=uniq-tld-nz-urls-`date +%F`.txt
echo "Creating file $uniq_urls_file containing just the unique domains and subdomains..."
cat $outfile | cut -d ' ' -f4 | cut -d/ -f3 | sed -r 's@\.?",\s*$@@' | sed -r 's@^www\.@@' | uniq > $uniq_urls_file

# The first cut grabs the url field of the json.
# The second cut grabs the domain name from the url (the part between the first // and the next /).
# The first sed then strips the closing '",' and any trailing spaces, along with an optional trailing dot
# just before them (e.g. "massey.ac.nz." becomes "massey.ac.nz"), and the final sed removes any leading "www." prefix.
# Then uniq reduces all that to the unique urls.
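# A worked example on a hypothetical cdx line (illustrative only; real lines carry more JSON fields):
#     nz,ac,massey)/ 20190721000000 {"url": "https://www.massey.ac.nz/", "mime": "text/html", ...}
#     cut -d ' ' -f4          ->  "https://www.massey.ac.nz/",
#     cut -d/ -f3             ->  www.massey.ac.nz
#     sed -r 's@\.?",\s*$@@'  ->  www.massey.ac.nz    (this sed also strips a trailing '",' when the url has no path)
#     sed -r 's@^www\.@@'     ->  massey.ac.nz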

echo "File $uniq_urls_file containing just the unique .nz sites (domains and subdomains) to be used as seed urls has now been created."
# Use wc -l with input redirection so that only the count, not the filename, ends up in each variable
num_uniq_urls=`wc -l < $uniq_urls_file`
total_urls=`wc -l < $outfile`
echo ""
echo ""
echo "Summary:"
echo "There were $num_uniq_urls unique sites"
echo "out of a total of $total_urls urls in $outfile."
echo ""
echo ""

echo ""
echo "NEXT:"
echo "Remember to delete the downloaded files in the tmp folder, or the folder itself, after inspecting its contents."
echo ""
exit 0