source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/setup.sh@ 33535

Last change on this file since 33535 was 33535, checked in by ak19, 5 years ago
  1. New setup.sh script, to be run on a Hadoop system, that sets up the git projects we use for processing CommonCrawl data the way we have been doing so far. This script can be split into INSTALL.sh and COMPILE.sh hereafter. 2. Modifications to the overall Readme.txt.
File size: 3.1 KB
Line 
#!/bin/bash
#
# setup.sh - check out hdfs-cc-work from the Greenstone svn, unpack and
# compile the three git projects used for processing CommonCrawl data
# (cc-index-table, ia-web-commons, ia-hadoop-tools) and put the jar files
# they need in place.
#
# Usage: ./setup.sh [copy_extra_jars]
#
#   copy_extra_jars  Pass this literal word as the first argument if you want
#                    certain additional jars copied over, a step that may be
#                    optional or may be necessary.
#
# Environment:
#   hadoop_common_dir  Directory on the hadoop classpath that jars are copied
#                      into (using sudo). Defaults to
#                      /usr/local/hadoop/share/hadoop/common
#
# Everything is checked out, extracted and built relative to the directory
# the script is run from.

# Stop at the first failing step: every later step depends on the earlier
# ones (checkout -> extract -> compile -> copy jars).
set -euo pipefail

copy_extra_jars=${1:-}

# Must be known BEFORE the first jar is copied; with an empty value the
# destination "$hadoop_common_dir/." would collapse to "/.".
hadoop_common_dir=${hadoop_common_dir:-/usr/local/hadoop/share/hadoop/common}

# Location of the jars inside the svn checkout, relative to the start folder.
# The compile steps below run in subshells so that the working directory
# never changes and this relative path stays valid.
jars_dir=hdfs-cc-work/jars

# Local maven repository that cc-index-table resolves its dependencies from.
# NOTE(review): hard-coded for the "vagrant" user, as before - confirm this
# matches the account that runs mvn.
m2_repo=/home/vagrant/.m2/repository

#######################################
# Copy a jar from $jars_dir into a local maven repository folder, unless a
# file of that name is already present there.
# Globals:   jars_dir (read)
# Arguments: $1 - step number used in the progress message
#            $2 - jar file name
#            $3 - destination directory (created if missing)
# Outputs:   progress message on stdout when a copy takes place
#######################################
install_jar_into_m2() {
  local step=$1
  local jar_name=$2
  local dest_dir=$3

  if [[ ! -e "$dest_dir/$jar_name" ]]; then
    echo "$step. For cc-index-table: Copying $jar_name into $dest_dir/"
    mkdir -p -- "$dest_dir"
    cp -- "$jars_dir/$jar_name" "$dest_dir/."
  fi
}

echo "Checking out hdfs-cc-work from GS svn's gs3-extensions/maori-lang-detection"
svn co http://svn.greenstone.org/gs3-extensions/maori-lang-detection/hdfs-cc-work

echo "Moving the tarred 3 git projects, cc-index-table (modified), ia-hadoop-tools (modified) and ia-web-commons into the current folder and extracting..."
for project in cc-index-table ia-hadoop-tools ia-web-commons; do
  mv -- "hdfs-cc-work/gitprojects/$project.tar" .
done
for project in cc-index-table ia-hadoop-tools ia-web-commons; do
  tar -xvf "$project.tar"
done


echo "Copying scripts into cc-index-table and making them executable"
# the script that does the actual work
cp hdfs-cc-work/scripts/get_maori_WET_records_for_crawl.sh cc-index-table/src/script/.
chmod u+x cc-index-table/src/script/get_maori_WET_records_for_crawl.sh
# top level script belonging in top level of cc-index-table
cp hdfs-cc-work/scripts/get_Maori_WET_records_from_CCSep2018_on.sh cc-index-table/.
chmod u+x cc-index-table/get_Maori_WET_records_from_CCSep2018_on.sh


echo "Compiling the modified cc-index-table"
(cd cc-index-table && mvn package)

# ia-web-commons has to be compiled up before ia-hadoop-tools
echo "Compiling ia-web-commons"
(cd ia-web-commons && mvn install)

echo "Compiling the modified ia-hadoop-tools"
(cd ia-hadoop-tools && mvn package)


echo "Putting jar files in place"

echo "1. For RUNNING warc to wet conversion (ia-hadoop-tools): Sudo copying hdfs-cc-work/jars/guava.jar into hadoop classpath $hadoop_common_dir"
sudo cp -- "$jars_dir/guava.jar" "$hadoop_common_dir/."

if [[ "$copy_extra_jars" == "copy_extra_jars" ]]; then
  echo "Copying extra jars"
  echo "2. For cc-index-table: Sudo copying hadoop-aws-2.7.6.jar and aws-java-sdk-1.11.616.jar"
  echo "from hdfs-cc-work/jars into hadoop classpath $hadoop_common_dir"
  # As the message above says, these jars live in hdfs-cc-work/jars.
  sudo cp -- "$jars_dir/hadoop-aws-2.7.6.jar" "$hadoop_common_dir/."
  sudo cp -- "$jars_dir/aws-java-sdk-1.11.616.jar" "$hadoop_common_dir/."

  install_jar_into_m2 3 aws-java-sdk-1.7.4.jar \
    "$m2_repo/com/amazonaws/aws-java-sdk/1.7.4"

  # Step 4 installs the hadoop-aws jar itself, matching both the existence
  # check and the destination folder.
  install_jar_into_m2 4 hadoop-aws-2.7.6.jar \
    "$m2_repo/org/apache/hadoop/hadoop-aws/2.7.6"
fi

echo "Done compiling and setting up."
echo "To get MRI warc to wet for a particular crawl timestamp, cd into cc-index-table and RUN:"
# The worker script was copied into cc-index-table/src/script/ above, so
# point the user at that path.
echo "./src/script/get_maori_WET_records_for_crawl.sh CC-MAIN-<YYYY-##>"
echo " where YYYY is the year and ## is the common crawl number for that year."
Note: See TracBrowser for help on using the repository browser.