#!/bin/bash

#####################################################################################################
# This script follows the instructions at https://groups.google.com/d/msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ
# to download all the .nz TLD URLs that CommonCrawl has crawled for a given date.
# It needs to be provided with either a cc-index.paths.gz file for a particular date or
# the URL for such a file, both obtainable from http://index.commoncrawl.org/
#####################################################################################################

# Show how to invoke this script; printed on -h/--help and on bad arguments.
function printUsage() {
    printf '%s\n' \
        "" \
        "Usage: $0 <cc-index.paths.gz | URL to a cc-index.paths.gz file, e.g. https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz>" \
        " You can download such a file or copy its URL from http://index.commoncrawl.org/" \
        ""
}


# Need to run this script from the top level folder of this extension
# This script needs to be provided with a cc-index.paths.gz downloaded from http://index.commoncrawl.org/
# or a URL of the form https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz

# Require exactly one argument.  BUGFIX: "$1" is quoted — unquoted, an
# argument containing whitespace word-splits and makes [ -z ... ] error out.
if [ -z "$1" ]; then
    printUsage
    exit 1
fi

# now we have a $1 — handle the help flags before doing any real work
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    printUsage
    exit 0
fi


# We'll create a temporary directory to work in and tell the user to delete it after we're done
mkdir -p tmp

# some sanity checking
infile=$1

# If $1 is the URL of a cc-index.paths.gz file, wget it and work with the
# local copy from here on.  Variables passed to external commands are quoted
# so paths containing whitespace survive.
if [[ $infile == https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-*/cc-index.paths.gz ]]; then
    wget "$infile"

    if [ ! -f cc-index.paths.gz ]; then
        echo "Downloading of $infile failed."
        # BUGFIX: was 'exit -1'; exit takes an unsigned status, use 1 for failure
        exit 1
    fi

    mv cc-index.paths.gz tmp/
    infile=tmp/cc-index.paths.gz
fi

# Whether downloaded above or passed in directly, we must now be holding a
# *cc-index.paths.gz file — anything else is unusable.
if [[ $infile != *cc-index.paths.gz ]]; then
    echo "*** Invalid parameter: This script can only work with the cc-index.paths.gz file or URL to such a file"
    echo ""
    printUsage
    exit 1
fi

# decompress. gunzip removes the .gz archive file and just leaves the decompressed version (same filename, without .gz)
# So infile hereafter is without the .gz suffix, https://wiki.bash-hackers.org/syntax/pe#substring_removal
gunzip "$infile"
infile=${infile%.gz}
#echo "Finished extracting $infile from the download"

aws_prefix=https://commoncrawl.s3.amazonaws.com/
cluster_url=

# Read the paths file line-by-line in reverse (tac) and stop at the first
# cluster.idx entry.  A plain "tac | while read" pipeline would run the loop
# body in a subshell, so we could neither break out of tac cleanly nor keep
# the cluster_url assignment; feeding the loop via process substitution keeps
# it in the current shell.
# https://unix.stackexchange.com/questions/166546/bash-cannot-break-out-of-piped-while-read-loop-process-substitution-works
while read line; do
    if [[ $line = *cluster.idx ]]; then
        #echo "Found cluster.idx mentioned at: $line"
        cluster_url=$aws_prefix$line
        echo "Will download the cluster.idx file from $cluster_url"
        echo ""
        break
    fi
done < <(tac "$infile")

# make sure we have a cluster.idx URL
# BUGFIX: the original tested $1 (the script argument, always non-empty by
# this point) instead of $cluster_url, so a paths file with no cluster.idx
# entry was never detected and the script blundered on with an empty URL.
if [ -z "$cluster_url" ]; then
    echo "Could not find the cluster.idx listed in $infile"
    echo ""
    exit 1
fi

# Work in the tmp folder for a while; bail out if it is not enterable.
pushd tmp || exit 1

# Now let's get cluster.idx
wget "$cluster_url"

if [ ! -f cluster.idx ]; then
    echo "Downloading of $cluster_url failed."
    echo ""
    exit 1
fi

# Hereafter, the amazon prefix is the parent path of the full URL to cluster.idx
aws_prefix=${cluster_url%cluster.idx}
#echo "New prefix: $aws_prefix"

# Combined output file, dated with today's date (backticks replaced by $()).
outfile=nz-only-TLDs-$(date +%F).txt
touch "$outfile"

# cluster.idx lines are tab-separated; cut -f2 takes the second field, which
# names the cdx-*.gz shard for that index range (presumably the shard file —
# verified only by how the loop uses it).  Lines starting "nz," cover the .nz
# TLD; uniq collapses repeated shard names.
while read line; do
    urls_file=$line
    echo "Found .nz TLD mentioned in: $urls_file (located at in $aws_prefix). Will wget and extract."
    echo ""

    # get the large file (under 1GB)
    wget "$aws_prefix$urls_file"
    # Unzip each, and we have all URLs with TLD .nz, but perhaps also .no for Norway in the first and .org in the last
    # We just want the nz domains.

    if [ ! -f "$urls_file" ]; then
        echo "ERROR: Downloading of $urls_file containing .nz specific URLs failed."
        echo ""
        # BUGFIX: was 'break', which contradicted the intent stated below and
        # abandoned every remaining shard on the first failed download.
        # Let's not exit, but go around the loop trying to download any other gz files also mentioning the .nz TLD
        continue
    else
        # Extract the huge file
        echo "Extracting..."
        gunzip "$urls_file"
        echo ""

        echo "Reducing the downloaded URLs to just those with toplevel domain .nz..."
        echo ""

        # Extract just those URLs that start with "^nz," (since the SURT form of URLs, the form starting with the TLD) come first on each line
        # and append to $outfile of all .nz TLDs for the specific crawl
        # (egrep is deprecated; grep -E is the portable spelling)
        urls_file=${urls_file%.gz}
        grep -E "^nz," "$urls_file" >> "$outfile"
    fi
done < <(grep '^nz,' cluster.idx | cut -f2 | uniq)

mv "$outfile" ../
# get out of the tmp dir
popd

echo ""
echo "The file $outfile has now been created, containing all the .nz domains for the crawl of $1"
echo "Remember to delete the products in the tmp folder or the folder itself after inspecting its contents"
echo ""
exit 0