source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/setup.sh@ 33538

Last change on this file since 33538 was 33538, checked in by ak19, 5 years ago

Some additions to the setup.sh script to query commoncrawl for MRI data on hadoop before I commit what I've done to crawl with Autistici's crawl software.

File size: 3.8 KB
Line 
#!/bin/bash
#
# Sets up a working area for querying Common Crawl for MRI (Maori) data on
# hadoop: checks out hdfs-cc-work from the Greenstone svn, unpacks and builds
# the three bundled git projects, and puts the required jars in place.
#
# Usage:
#   ./setup.sh [copy_extra_jars]
#
# Arguments:
#   $1 - pass the literal value "copy_extra_jars" to also copy the additional
#        AWS jars (steps 2-4), a step that may be optional or may be necessary.
#
# Environment:
#   hadoop_common_dir - hadoop classpath directory that jars are copied into.
#                       Defaults to /usr/local/hadoop/share/hadoop/common.
#   SPARK_HOME        - only echoed in the closing instructions.
#
# Run from the directory that should end up containing hdfs-cc-work,
# cc-index-table, ia-hadoop-tools and ia-web-commons.

# Stop at the first failing step so a broken checkout or build is never
# reported as "Done".
set -euo pipefail

copy_extra_jars=${1:-}

# Must be defined before the first jar copy: an empty value would make
# "sudo cp ... $hadoop_common_dir/." write into the filesystem root.
hadoop_common_dir=${hadoop_common_dir:-/usr/local/hadoop/share/hadoop/common}

# Where the checked-out jars live, relative to the starting directory.
jars_dir=hdfs-cc-work/jars

# Local maven repository that cc-index-table's build resolves jars from.
m2_repo=/home/vagrant/.m2/repository

# Copies a jar from $jars_dir into a local maven repository directory, unless
# a file of that name is already present there.
#   $1 - step number used in the progress message
#   $2 - jar file name
#   $3 - destination directory
install_jar_into_m2() {
  local step=$1
  local jar=$2
  local dest_dir=$3

  if [[ ! -e "$dest_dir/$jar" ]]; then
    echo "$step. For cc-index-table: Copying $jar into $dest_dir/"
    mkdir -p "$dest_dir"
    cp "$jars_dir/$jar" "$dest_dir/."
  fi
}

echo "Checking out hdfs-cc-work from GS svn's gs3-extensions/maori-lang-detection"
svn co http://svn.greenstone.org/gs3-extensions/maori-lang-detection/hdfs-cc-work

echo "Moving the tarred 3 git projects, cc-index-table (modified), ia-hadoop-tools (modified) and ia-web-commons into the current folder and extracting..."
for project in cc-index-table ia-hadoop-tools ia-web-commons; do
  mv "hdfs-cc-work/gitprojects/${project}.tar" .
done
for project in cc-index-table ia-hadoop-tools ia-web-commons; do
  tar -xvf "${project}.tar"
done

echo "Copying scripts into cc-index-table and making them executable"
# the script that does the actual work
cp hdfs-cc-work/scripts/get_maori_WET_records_for_crawl.sh cc-index-table/src/script/.
chmod u+x cc-index-table/src/script/get_maori_WET_records_for_crawl.sh
# top level script belonging in top level of cc-index-table
cp hdfs-cc-work/scripts/get_Maori_WET_records_from_CCSep2018_on.sh cc-index-table/.
chmod u+x cc-index-table/get_Maori_WET_records_from_CCSep2018_on.sh

# Each build runs in a subshell so the working directory is still the starting
# directory afterwards; the jar paths below are relative to it.
echo "Compiling the modified cc-index-table"
(cd cc-index-table && mvn package)

# ia-web-commons has to be compiled up before ia-hadoop-tools
echo "Compiling ia-web-commons"
(cd ia-web-commons && mvn install)

echo "Compiling the modified ia-hadoop-tools"
(cd ia-hadoop-tools && mvn package)

echo "Putting jar files in place"

echo "1. For RUNNING warc to wet conversion (ia-hadoop-tools): Sudo copying hdfs-cc-work/jars/guava.jar into hadoop classpath $hadoop_common_dir"
sudo cp "$jars_dir/guava.jar" "$hadoop_common_dir/."

if [[ "$copy_extra_jars" == "copy_extra_jars" ]]; then
  echo "Copying extra jars"
  echo "2. For cc-index-table: Sudo copying hadoop-aws-2.7.6.jar and aws-java-sdk-1.11.616.jar"
  echo "from hdfs-cc-work/jars into hadoop classpath $hadoop_common_dir"
  sudo cp "$jars_dir/hadoop-aws-2.7.6.jar" "$hadoop_common_dir/."
  sudo cp "$jars_dir/aws-java-sdk-1.11.616.jar" "$hadoop_common_dir/."

  install_jar_into_m2 3 aws-java-sdk-1.7.4.jar \
    "$m2_repo/com/amazonaws/aws-java-sdk/1.7.4"
  install_jar_into_m2 4 hadoop-aws-2.7.6.jar \
    "$m2_repo/org/apache/hadoop/hadoop-aws/2.7.6"
fi

# NOTE(review): the instructions below say to run
# ./get_maori_WET_records_for_crawl.sh from inside cc-index-table, but this
# script copies that file to cc-index-table/src/script/. Confirm whether the
# path in the message should be ./src/script/get_maori_WET_records_for_crawl.sh.
cat <<EOF
Done compiling and automated parts of setting up.
NEXT STEP:
Ensure you have sudo edited ${SPARK_HOME:-}/conf/spark-defaults.conf
 (/usr/local/spark-2.3.0-bin-hadoop2.7/conf/spark-defaults.conf)
to contain the following 3 lines with YOUR Amazon AWS IAM Role access and secret keys:
 spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
 spark.hadoop.fs.s3a.access.key=YOUR_AWS_IAM-ROLE_ACCESSKEY_HERE
 spark.hadoop.fs.s3a.secret.key=YOUR_AWS_IAM-ROLE_SECRETKEY_HERE
Consult GS_README.TXT section B (and C) for instructions on setting up an AWS IAM role.
Only when that's done will you be ready to run the following script.

THEN:
To get MRI warc to wet for a particular crawl timestamp, cd into cc-index-table and RUN:
./get_maori_WET_records_for_crawl.sh CC-MAIN-<YYYY-##>
 where YYYY is the year and ## is the common crawl number for that year.
EOF
Note: See TracBrowser for help on using the repository browser.