#!/bin/bash

#####################################################################################################
# This script follows the instructions at https://groups.google.com/d/msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ
# to download all the .nz TLD URLs that CommonCrawl has crawled for a given date.
# It needs to be provided with either a cc-index.paths.gz file for a particular date or
# the URL for such a file, both obtainable from http://index.commoncrawl.org/
#####################################################################################################

function printUsage() {
    echo ""
    echo "Usage: $0 <cc-index.paths.gz file or URL to one>"
    echo "    You can download such a file or copy its URL from http://index.commoncrawl.org/"
    echo ""
}

# This script needs to be run from the top level folder of this extension.
# It needs to be provided with a cc-index.paths.gz downloaded from http://index.commoncrawl.org/
# or a URL of the form https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/cc-index.paths.gz
if [ -z "$1" ]; then
    printUsage
    exit 1
fi

# now we have a $1
if [[ "x$1" = "x--help" || "x$1" = "x-h" ]]; then
    printUsage
    exit 0
fi

# We'll create a temporary directory to work in and tell the user to delete it after we're done
mkdir -p tmp

# some sanity checking
infile=$1

# If $1 is a URL of the expected form for cc-index.paths.gz, then wget it and make the downloaded file our input file
if [[ $infile == https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-*/cc-index.paths.gz ]]; then
    wget "$infile"
    if [ ! -f cc-index.paths.gz ]; then
        echo "Downloading of $infile failed."
        exit 1
    fi
    mv cc-index.paths.gz tmp/.
    infile=tmp/cc-index.paths.gz
fi

if [[ $infile != *cc-index.paths.gz ]]; then
    echo "*** Invalid parameter: This script can only work with the cc-index.paths.gz file or URL to such a file"
    echo ""
    printUsage
    exit 1
fi

# Decompress. gunzip removes the .gz archive file and just leaves the decompressed version (same filename, without .gz).
# So infile hereafter is without the .gz suffix, see https://wiki.bash-hackers.org/syntax/pe#substring_removal
gunzip "$infile"
infile=${infile%.gz}
#echo "Finished extracting $infile from the download"

aws_prefix=https://commoncrawl.s3.amazonaws.com/
cluster_url=

# Read the file line by line in reverse (tac, i.e. reversed cat, instead of cat) and store the cluster.idx line
# http://forums.codeguru.com/showthread.php?472039-RESOLVED-BASH-read-from-end-of-file-upwards
#tac $infile | while read line; do
#    if [[ $line = *cluster.idx ]]; then
#        echo "Found cluster.idx at: $line"
#        cluster_url=$aws_prefix$line
#        echo "Will download cluster.idx file from $cluster_url"
#    fi
#done
# The above way is bad: the pipe runs the while loop in a subshell, so cluster_url would not survive it,
# and we can't BREAK out of the outer process (tac, reading each line in) when we've found the line we want.
# Instead, use process substitution: read until we find the line of interest, then BREAK out of the loop properly/cleanly:
# https://unix.stackexchange.com/questions/166546/bash-cannot-break-out-of-piped-while-read-loop-process-substitution-works
while read line; do
    if [[ $line = *cluster.idx ]]; then
        #echo "Found cluster.idx mentioned at: $line"
        cluster_url=$aws_prefix$line
        echo "Will download the cluster.idx file from $cluster_url"
        echo ""
        break
    fi
done < <(tac "$infile")

# Make sure we found a cluster.idx URL (test cluster_url, not $1, which is always set by this point)
if [ -z "$cluster_url" ]; then
    echo "Could not find the cluster.idx listed in $infile"
    echo ""
    exit 1
fi

# Work in the tmp folder for a while
pushd tmp

# Now let's get cluster.idx
wget "$cluster_url"
if [ ! -f cluster.idx ]; then
    echo "Downloading of $cluster_url failed."
    echo ""
    exit 1
fi
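# A note on what we just downloaded (layout as observed for the crawls this script was written against,
# so treat it as an assumption rather than a guarantee): each line of cluster.idx is tab-separated, with
# the SURT-form URL key (reversed domain, TLD first, e.g. "nz,...") plus a timestamp in the first field,
# and the name of the compressed cdx-*.gz index shard covering that key range in the second field.
# That second field is what the grep/cut pipeline further below extracts.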
echo "" exit -1 fi # Hereafter, the amazon prefix is the parent path of the full URL to cluster.idx aws_prefix=${cluster_url%cluster.idx} #echo "New prefix: $aws_prefix" outfile=nz-only-TLDs-`date +%F`.txt touch $outfile while read line; do urls_file=$line echo "Found .nz TLD mentioned in: $urls_file (located at in $aws_prefix). Will wget and extract." echo "" # get the large file (under 1GB) wget "$aws_prefix$urls_file" # Unzip each, and we have all URLs with TLD .nz, but perhaps also .no for Norway in the first and .org in the last # We just want the nz domains. if [ ! -f $urls_file ]; then echo "ERROR: Downloading of $urls_file containing .nz specific URLs failed." echo "" break # Let's not exit, but go around the loop trying to download any other gz files also mentioning the .nz TLD else # Extract the huge file echo "Extracting..." echo " (This may take a minute or so, please be patient.)" gunzip $urls_file echo "" echo "Reducing the downloaded URLs to just those with toplevel domain .nz..." echo "" # Extract just those URLs that start with "^nz," (since the SURT form of URLs, the form starting with the TLD) come first on each line # and append to $outfile of all .nz TLDs for the specific crawl # See https://www.unix.com/shell-programming-and-scripting/176608-how-copy-lines-starts-either-3-4-into-new-file.html urls_file=${urls_file%.gz} egrep "^nz," $urls_file >> $outfile fi done < <(grep '^nz,' cluster.idx | cut -f2 | uniq) mv $outfile ../. # get out of the tmp dir popd echo "" echo "The file $outfile has now been created, containing all the .nz domains for the crawl of $1" echo "Remember to delete the products in the tmp folder or the folder itself after inspecting its contents" echo "" exit 0