source: gs3-extensions/maori-lang-detection/bin/script/create-uniq-WET-urls-file.sh@ 33413

Last change on this file since 33413 was 33413, checked in by ak19, 5 years ago

Splitting the get_commoncrawl_nz_urls.sh script back into 2 scripts, itself and create-uniq-nz-urls-file.sh. Added a new script, create-uniq-WET-urls-file.sh, to get the WET urls once we have all the CC URLs for the .nz TLD.

  • Property svn:executable set to *
File size: 3.1 KB
#!/bin/bash

# Need to run this script from the top level folder of this extension
# This script needs to be provided with the nz-only-TLDs-<date>.txt file produced by running get_commoncrawl_nz_urls.sh
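# Example invocation (the date in the filename is just a hypothetical placeholder):
#    ./bin/script/create-uniq-WET-urls-file.sh nz-only-TLDs-2019-08-01.txt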
if [ -z "$1" ]; then
    echo ""
    echo "Usage: $0 nz-only-TLDs-<date>.txt"
    echo " where the nz-only-TLDs-<date>.txt file is produced by running get_commoncrawl_nz_urls.sh."
    echo ""
    exit 1
fi

uniq_urls_file=uniq-tld-nz-WET-urls-`date +%F`.txt
echo "Creating file $uniq_urls_file containing just the unique WET gz urls for the .nz TLD..."

# cat is unnecessary: https://stackoverflow.com/questions/1915636/is-there-a-way-to-uniq-by-column/1915750
# "You can dump the cat! Rather than piping into <the subsequent process>, just let <the subsequent process> read the file using <. Piping through cat is a common unnecessary complication used by novices. For large amounts of data there's a performance effect to be had."

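# As a small illustration of the advice above (the filename is just a placeholder):
#    cat some-urls.txt | sort -u      # unnecessary use of cat
#    <some-urls.txt sort -u           # same result, reading the file via redirection
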
#<$1 cut -d ' ' -f18 | sed -r 's@\.?"(,|})\s*$@@' | sed -r 's@^\s*"@@' > $uniq_urls_file
#sort -u $uniq_urls_file > tmp.txt

# The first cut grabs the filename field of the json, containing the relative path to the WET file
# The sed operations remove any starting and terminating double quotes followed by , or }
# Then we get the uniq urls out of all that.

# The above is imperfect because space is not a proper field separator (some json property values
# contain spaces)
# https://unix.stackexchange.com/questions/155952/should-i-use-piping-or-redirection-for-input-to-sort
tmpfile=tmp_WET_urls.txt

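# Rough illustration of what the extraction below does (the exact layout of each index line is an
# assumption, not taken from this repo): an input line is expected to look something like
#   nz,example)/ 20190715180000 {"url": "https://example.nz/", ..., "filename": "crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/warc/CC-MAIN-20190715175205-20190715200159-00000.warc.gz"}
# The first sed strips everything up to and including the quote after "filename":, and the second sed
# strips the closing double quote, the ',' or '}' after it, and anything beyond, leaving just the
# relative path to the .warc.gz file.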
<$1 sed -r 's@^.*"filename":\s*"@@' | sed -r 's@\.?"(,|}).*$@@' > $tmpfile
sort -u $tmpfile > sorted_WET_urls.txt

rm $tmpfile

# remove all lines containing paths like /crawldiagnostics/ and /robotstxt/
# https://stackoverflow.com/questions/17049962/delete-line-containing-one-of-multiple-strings

# And convert links like
# https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/warc/CC-MAIN-20190715175205-20190715200159-00000.warc.gz
# to:
# https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200159-00000.warc.wet.gz

awk '!/crawldiagnostics|robotstxt/' sorted_WET_urls.txt | sed -r 's@\/warc\/CC-MAIN-@/wet/CC-MAIN-@' | sed -r 's@\.warc\.gz@.warc.wet.gz@' > $uniq_urls_file

rm sorted_WET_urls.txt

# All urls need to be prefixed with:
#ccprefix=https://commoncrawl.s3.amazonaws.com/
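# A later step could prepend that prefix along these lines (an illustrative sketch only; the output
# filename is made up and this is not part of the current pipeline):
#    sed "s@^@https://commoncrawl.s3.amazonaws.com/@" $uniq_urls_file > full-WET-urls.txt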
# From each line of json in the input file, we want the value of the filename field, containing
# the relative path to the gzipped WET file.
# The first sed grabs everything after the filename field in each line/json
# The second sed operation removes any terminating double quotes followed by , or }
# Then we sort -u to get the uniq urls out of all that.

58echo "File $uniq_urls_file containing just the unique commoncrawl WET urls of the .nz TLD has now been created."
num_uniq_urls=`wc -l < $uniq_urls_file`
total_urls=`wc -l < $1`
echo ""
echo ""
echo "Summary:"
echo "There were $num_uniq_urls unique WET urls"
echo "out of a total of $total_urls urls in $1."
echo ""
echo ""

exit 0