source: gs3-extensions/maori-lang-detection/bin/script/get_commoncrawl_nz_urls.sh@33393

Last change on this file since 33393 was 33393, checked in by ak19, 5 years ago

Modified the get_commoncrawl_nz_urls.sh to also create a reduced urls file of just the unique toplevel sites

  • Property svn:executable set to *
File size: 6.5 KB
#!/bin/bash

#####################################################################################################
# This script follows the instructions at https://groups.google.com/d/msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ
# to download all the .nz TLD URLs that CommonCrawl has crawled for a given date.
# It needs to be provided with either a cc-index.paths.gz file for a particular date or
# the URL for such a file, both obtainable from http://index.commoncrawl.org/
#####################################################################################################


function printUsage() {
    echo ""
    echo "Usage: $0 <cc-index.paths.gz | URL to a cc-index.paths.gz file, e.g. https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz>"
    echo "       You can download such a file or copy its URL from http://index.commoncrawl.org/"
    echo ""
}


# Need to run this script from the top level folder of this extension.
# This script needs to be provided with a cc-index.paths.gz downloaded from http://index.commoncrawl.org/
# or a URL of the form https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz
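# Example invocation (illustrative only; substitute the crawl date you are interested in), run from
# the extension's top level folder:
#     bin/script/get_commoncrawl_nz_urls.sh https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz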
if [ -z "$1" ]; then
    printUsage
    exit 1
fi

# now we have a $1
if [[ "x$1" = "x--help" || "x$1" = "x-h" ]]; then
    printUsage
    exit 0
fi

# We'll create a temporary directory to work in and tell the user to delete it after we're done
mkdir -p tmp

# some sanity checking
infile=$1
# If $1 has the correct format for the URL to a cc-index.paths.gz file, then wget it and make the
# downloaded copy the infile that the rest of the script works on.
if [[ $infile == https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-*/cc-index.paths.gz ]]; then
    wget "$infile"

    if [ ! -f cc-index.paths.gz ]; then
        echo "Downloading of $infile failed."
        exit -1
    fi

    mv cc-index.paths.gz tmp/.
    infile=tmp/cc-index.paths.gz
fi

if [[ $infile != *cc-index.paths.gz ]]; then
    echo "*** Invalid parameter: This script can only work with the cc-index.paths.gz file or URL to such a file"
    echo ""
    printUsage
    exit -1
fi

# Decompress. gunzip removes the .gz archive file and just leaves the decompressed version (same filename, without .gz).
# So infile hereafter is without the .gz suffix, https://wiki.bash-hackers.org/syntax/pe#substring_removal
gunzip $infile
infile=${infile%.gz}
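# For illustration (using the path this script itself sets when given a URL): if infile was
# "tmp/cc-index.paths.gz", the ${infile%.gz} expansion above leaves infile as "tmp/cc-index.paths",
# which matches the filename gunzip leaves behind.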
#echo "Finished extracting $infile from the download"

aws_prefix=https://commoncrawl.s3.amazonaws.com/
cluster_url=
# Read the file in line-by-line in reverse (tac, the reversed cat, instead of cat) and store the cluster.idx line.
# http://forums.codeguru.com/showthread.php?472039-RESOLVED-BASH-read-from-end-of-file-upwards

#tac $infile | while read line; do
#    if [[ $line = *cluster.idx ]]; then
#        echo "Found cluster.idx at: $line"
#        cluster_url=$aws_prefix$line
#        echo "Will download cluster.idx file from $cluster_url"
#    fi
#done

# The piped version above is no good: the while loop runs in a subshell on the receiving end of the pipe,
# so we cannot BREAK out of it once we have found the line we want, and any assignment to cluster_url would
# be lost when that subshell exits. Instead, feed tac's output to the loop via process substitution, read
# until we find the line of interest, and then break out of the loop properly/cleanly:
# https://unix.stackexchange.com/questions/166546/bash-cannot-break-out-of-piped-while-read-loop-process-substitution-works
while read line; do
    if [[ $line = *cluster.idx ]]; then
        #echo "Found cluster.idx mentioned at: $line"
        cluster_url=$aws_prefix$line
        echo "Will download the cluster.idx file from $cluster_url"
        echo ""
        break
    fi
done < <(tac $infile)

# make sure we have a cluster.idx URL
if [ -z "$cluster_url" ]; then
    echo "Could not find the cluster.idx listed in $infile"
    echo ""
    exit -1
fi

# Work in the tmp folder for a while
pushd tmp

# Now let's get cluster.idx
wget $cluster_url

if [ ! -f cluster.idx ]; then
    echo "Downloading of $cluster_url failed."
    echo ""
    exit -1
fi

# Hereafter, the amazon prefix is the parent path of the full URL to cluster.idx
aws_prefix=${cluster_url%cluster.idx}
#echo "New prefix: $aws_prefix"
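# For illustration (the exact layout can differ between crawls): if cc-index.paths.gz listed
# cc-index/collections/CC-MAIN-2019-30/indexes/cluster.idx, then cluster_url is
# https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-30/indexes/cluster.idx
# and the ${cluster_url%cluster.idx} expansion strips the trailing "cluster.idx", leaving aws_prefix as
# https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-30/indexes/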

outfile=nz-only-TLDs-`date +%F`.txt
touch $outfile

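# The loop below is driven by cluster.idx. As far as this script relies on it (the column layout described
# here is an assumption about the index format, not something the script verifies), it is a tab-separated
# index whose first field starts with the SURT-form key of a block of indexed URLs and whose second field
# names the cdx-*.gz shard holding that block. In SURT form the host is written back-to-front, so a host
# like www.massey.ac.nz appears roughly as "nz,ac,massey)/..." and every .nz entry starts with "nz,".
# Hence grep '^nz,' + cut -f2 + uniq (at the bottom of the loop) yields the cdx-*.gz shards to fetch.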
while read line; do
    urls_file=$line
    echo "Found .nz TLD mentioned in: $urls_file (located in $aws_prefix). Will wget and extract."
    echo ""

    # get the large file (under 1GB)
    wget "$aws_prefix$urls_file"
    # Unzip each, and we have all the URLs with TLD .nz; but since each shard spans a sorted range of SURT
    # keys, the first one may also contain some .no (Norway) URLs and the last one some .org URLs.
    # We just want the nz domains.

    if [ ! -f "$urls_file" ]; then
        echo "ERROR: Downloading of $urls_file containing .nz specific URLs failed."
        echo ""
        # Let's not exit, but go around the loop trying to download any other gz files also mentioning the .nz TLD
        continue
    else
        # Extract the huge file
        echo "Extracting..."
        echo "   (This may take a minute or so, please be patient.)"
        gunzip $urls_file
        echo ""

        echo "Reducing the downloaded URLs to just those with toplevel domain .nz..."
        echo ""

        # Extract just those lines that start with "nz," (since the SURT form of each URL, which begins with the TLD, comes first on each line)
        # and append them to $outfile, the collection of all .nz TLD entries for this specific crawl.
        # See https://www.unix.com/shell-programming-and-scripting/176608-how-copy-lines-starts-either-3-4-into-new-file.html
        urls_file=${urls_file%.gz}
        egrep "^nz," $urls_file >> $outfile
    fi
done < <(grep '^nz,' cluster.idx | cut -f2 | uniq)

mv $outfile ../.
# get out of the tmp dir
popd

echo ""
echo "The file $outfile has now been created, containing all the .nz URLs for the crawl of $1"
echo ""

uniq_urls_file=uniq-tld-nz-urls-`date +%F`.txt
echo "Creating file $uniq_urls_file containing just the unique domains and subdomains..."
cat $outfile | cut -d ' ' -f4 | cut -d/ -f3 | sed -r 's@\.?",\s*$@@' | sed -r 's@^www\.@@' | uniq > $uniq_urls_file

# The first cut grabs the url field of the json.
# The second cut grabs the domain name from the url (the part between the first // and the next /).
# The first sed then strips the closing '",' and any trailing spaces, along with an optional trailing dot
# just before them (e.g. "massey.ac.nz." becomes "massey.ac.nz"), and the final sed removes any leading "www." prefix.
# Then uniq reduces all that to the unique urls.
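# A worked example on a hypothetical cdx line (illustrative only; real lines carry more JSON fields):
#     nz,ac,massey)/ 20190721000000 {"url": "https://www.massey.ac.nz/", "mime": "text/html", ...}
#     cut -d ' ' -f4          ->  "https://www.massey.ac.nz/",
#     cut -d/ -f3             ->  www.massey.ac.nz
#     sed -r 's@\.?",\s*$@@'  ->  www.massey.ac.nz    (this sed also strips a trailing '",' when the url has no path)
#     sed -r 's@^www\.@@'     ->  massey.ac.nz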

echo "File $uniq_urls_file containing just the unique .nz sites (domains and subdomains) to be used as seed urls has now been created."
# Use wc -l with input redirection so that only the count, not the filename, ends up in each variable
num_uniq_urls=`wc -l < $uniq_urls_file`
total_urls=`wc -l < $outfile`
echo ""
echo ""
echo "Summary:"
echo "There were $num_uniq_urls unique sites"
echo "out of a total of $total_urls urls in $outfile."
echo ""
echo ""

echo ""
echo "NEXT:"
echo "Remember to delete the downloaded files in the tmp folder, or the folder itself, after inspecting its contents."
echo ""
exit 0