#!/bin/bash

#####################################################################################################
# This script follows the instructions at https://groups.google.com/d/msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ
# to download all the .nz TLD URLs that CommonCrawl has crawled for a given date.
# It needs to be provided with either a cc-index.paths.gz file for a particular date or
# the URL for such a file, both obtainable from http://index.commoncrawl.org/
#####################################################################################################


function printUsage() {
    echo ""
    echo "Usage: $0 <cc-index.paths.gz | URL to a cc-index.paths.gz file, e.g. https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz>"
    echo "   You can download such a file or copy its URL from http://index.commoncrawl.org/"
    echo ""
}


# Need to run this script from the top level folder of this extension
# This script needs to be provided with a cc-index.paths.gz downloaded from http://index.commoncrawl.org/
# or a URL of the form https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz
if [ -z $1 ]; then
    printUsage
    exit 1
fi

# now we have a $1
if [[ "x$1" = "x--help" || "x$1" = "x-h" ]]; then
    printUsage
    exit 0
fi	

# We'll create a temporary directory to work in and tell the user to delete it after we're done
mkdir -p tmp

# some sanity checking
infile=$1
# if $1 contains the correct format for the URL to the cc-index.paths.gz, then wget it and make that file the 
if [[ $infile == https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-*/cc-index.paths.gz ]]; then
    wget $infile

    if [ ! -f cc-index.paths.gz ]; then
	echo "Downloading of $infile failed."
	exit -1
    fi
    
    mv cc-index.paths.gz tmp/.
    infile=tmp/cc-index.paths.gz
fi

if [[ $infile != *cc-index.paths.gz ]]; then
    echo "*** Invalid parameter: This script can only work with the cc-index.paths.gz file or URL to such a file"
    echo ""
    printUsage
    exit -1
fi

# decompress. gunzip removes the .gz archive file and just leaves the decompressed version (same filename, without .gz)
# So infile hereafter is without the .gz suffix, https://wiki.bash-hackers.org/syntax/pe#substring_removal
gunzip $infile
infile=${infile%.gz}
#echo "Finished extracting $infile from the download"

aws_prefix=https://commoncrawl.s3.amazonaws.com/
cluster_url=
# read in file line-by-line in reverse (cat-reversed tac instead of cat), and store the cluster.idx line
# http://forums.codeguru.com/showthread.php?472039-RESOLVED-BASH-read-from-end-of-file-upwards

#tac $infile | while read line; do
#    if [[ $line = *cluster.idx ]]; then
#	echo "Found cluster.idx at: $line"
#	cluster_url=$aws_prefix$line
#	echo "Will download cluster.idx file from $cluster_url"
#    fi
#done

# Above way is bad because we can't BREAK out of the outer process (tac, reading each line in) when we've found the line we want.
# Instead, do it the following way: to read until we found the line of interest so we can then BREAK out of the loop properly/cleanly:
# https://unix.stackexchange.com/questions/166546/bash-cannot-break-out-of-piped-while-read-loop-process-substitution-works
while read line; do
    if [[ $line = *cluster.idx ]]; then
	#echo "Found cluster.idx mentioned at: $line"
	cluster_url=$aws_prefix$line
	echo "Will download the cluster.idx file from $cluster_url"
	echo ""
	break
    fi
done < <(tac $infile) 

# make sure we have a cluster.idx URL
if [ -z $1 ]; then
    echo "Could not find the cluster.idx listed in $infile"
    echo ""
    exit -1
fi

# Work in the tmp folder for a while
pushd tmp

# Now let's get cluster.idx
wget $cluster_url

if [ ! -f cluster.idx ]; then
    echo "Downloading of $cluster_url failed."
    echo ""
    exit -1
fi

# Hereafter, the amazon prefix is the parent path of the full URL to cluster.idx
aws_prefix=${cluster_url%cluster.idx}
#echo "New prefix: $aws_prefix"

outfile=nz-only-TLDs-`date +%F`.txt
touch $outfile

while read line; do
    urls_file=$line
    echo "Found .nz TLD mentioned in: $urls_file (located at in $aws_prefix). Will wget and extract."
    echo ""
    
    # get the large file (under 1GB)
    wget "$aws_prefix$urls_file"
    # Unzip each, and we have all URLs with TLD .nz, but perhaps also .no for Norway in the first and .org in the last
    # We just want the nz domains. 
    
    if [ ! -f $urls_file ]; then
    	echo "ERROR: Downloading of $urls_file containing .nz specific URLs failed."
    	echo ""
    	break
	# Let's not exit, but go around the loop trying to download any other gz files also mentioning the .nz TLD
    else
	# Extract the huge file
    	echo "Extracting..."
	echo "   (This may take a minute or so, please be patient.)"
    	gunzip $urls_file
    	echo ""

	echo "Reducing the downloaded URLs to just those with toplevel domain .nz..."
	echo ""
    
	# Extract just those URLs that start with "^nz," (since the SURT form of URLs, the form starting with the TLD) come first on each line
	# and append to $outfile of all .nz TLDs for the specific crawl
	# See https://www.unix.com/shell-programming-and-scripting/176608-how-copy-lines-starts-either-3-4-into-new-file.html
	urls_file=${urls_file%.gz}
	egrep "^nz," $urls_file >> $outfile
    fi
done < <(grep '^nz,' cluster.idx | cut -f2 | uniq)

mv $outfile ../.
# get out of the tmp dir
popd

echo ""
echo "The file $outfile has now been created, containing all the .nz domains for the crawl of $1"
echo "Remember to delete the products in the tmp folder or the folder itself after inspecting its contents"
echo ""
exit 0