#!/bin/bash
#
# Sets up, in the directory this script is run from, the projects used to get
# Maori-language WET records out of Common Crawl data:
#   1. checks out hdfs-cc-work from the Greenstone svn repository,
#   2. unpacks the three tarred git projects it contains (cc-index-table,
#      ia-hadoop-tools, ia-web-commons),
#   3. copies the helper scripts into cc-index-table,
#   4. builds the three projects with Maven,
#   5. copies jar files into the hadoop classpath and, on request, into the
#      local Maven repository.
#
# Arguments:
#   $1  set to "copy_extra_jars" if you want certain additional jars copied
#       over, a step that may be optional or may be necessary.
#
# Environment:
#   hadoop_common_dir  hadoop classpath folder the jars are copied into;
#                      defaults to /usr/local/hadoop/share/hadoop/common.
#
# Uses svn, tar, mvn and sudo (for writing into the hadoop folder).

# Stop at the first failing step: continuing after a failed checkout or build
# would only end in sudo-copying files that are missing or stale.
set -euo pipefail

copy_extra_jars=${1:-}

# Anchor every path at the starting directory so the jar/script locations
# stay valid regardless of the directory changes made for the Maven builds.
top_dir=$(pwd)
jars_dir="$top_dir/hdfs-cc-work/jars"
scripts_dir="$top_dir/hdfs-cc-work/scripts"

# Must be known before the first jar copy below; an unset value would turn
# "sudo cp <jar> $hadoop_common_dir/." into a copy into the filesystem root.
hadoop_common_dir=${hadoop_common_dir:-/usr/local/hadoop/share/hadoop/common}

# Local Maven repository that the optional jars are seeded into.
m2_repo=/home/vagrant/.m2/repository

# Print an error message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Copy a jar from hdfs-cc-work/jars into a Maven repository folder, unless a
# file of that name is already present there.
# Arguments: $1 - step number used in the progress message
#            $2 - jar file name
#            $3 - destination folder (created if missing)
copy_jar_into_m2() {
  local step=$1
  local jar=$2
  local dest_dir=$3

  if [[ ! -e "$dest_dir/$jar" ]]; then
    echo "$step. For cc-index-table: Copying $jar into $dest_dir/"
    mkdir -p "$dest_dir"
    cp "$jars_dir/$jar" "$dest_dir/"
  fi
}

echo "Checking out hdfs-cc-work from GS svn's gs3-extensions/maori-lang-detection"
svn co http://svn.greenstone.org/gs3-extensions/maori-lang-detection/hdfs-cc-work

echo "Moving the tarred 3 git projects, cc-index-table (modified), ia-hadoop-tools (modified) and ia-web-commons into the current folder and extracting..."
for project in cc-index-table ia-hadoop-tools ia-web-commons; do
  mv "$top_dir/hdfs-cc-work/gitprojects/$project.tar" "$top_dir/"
done
for project in cc-index-table ia-hadoop-tools ia-web-commons; do
  tar -xvf "$top_dir/$project.tar" -C "$top_dir"
done

echo "Copying scripts into cc-index-table and making them executable"
# the script that does the actual work
cp "$scripts_dir/get_maori_WET_records_for_crawl.sh" "$top_dir/cc-index-table/src/script/"
chmod u+x "$top_dir/cc-index-table/src/script/get_maori_WET_records_for_crawl.sh"
# top level script belonging in top level of cc-index-table
cp "$scripts_dir/get_Maori_WET_records_from_CCSep2018_on.sh" "$top_dir/cc-index-table/"
chmod u+x "$top_dir/cc-index-table/get_Maori_WET_records_from_CCSep2018_on.sh"

# Each build runs in a subshell so the script's own working directory never
# changes; a failing build still aborts the script because of set -e.
echo "Compiling the modified cc-index-table"
(cd "$top_dir/cc-index-table" && mvn package)

# ia-web-commons has to be compiled up before ia-hadoop-tools
echo "Compiling ia-web-commons"
(cd "$top_dir/ia-web-commons" && mvn install)

echo "Compiling the modified ia-hadoop-tools"
(cd "$top_dir/ia-hadoop-tools" && mvn package)

echo "Putting jar files in place"

[[ -d "$hadoop_common_dir" ]] \
  || die "hadoop classpath folder not found: $hadoop_common_dir"

echo "1. For RUNNING warc to wet conversion (ia-hadoop-tools): Sudo copying hdfs-cc-work/jars/guava.jar into hadoop classpath $hadoop_common_dir"
sudo cp "$jars_dir/guava.jar" "$hadoop_common_dir/"

if [[ "$copy_extra_jars" == "copy_extra_jars" ]]; then
  echo "Copying extra jars"
  echo "2. For cc-index-table: Sudo copying hadoop-aws-2.7.6.jar and aws-java-sdk-1.11.616.jar"
  echo "from hdfs-cc-work/jars into hadoop classpath $hadoop_common_dir"
  sudo cp "$jars_dir/hadoop-aws-2.7.6.jar" "$hadoop_common_dir/"
  sudo cp "$jars_dir/aws-java-sdk-1.11.616.jar" "$hadoop_common_dir/"

  copy_jar_into_m2 3 aws-java-sdk-1.7.4.jar \
    "$m2_repo/com/amazonaws/aws-java-sdk/1.7.4"
  # Step 4 must install the hadoop-aws jar itself into the hadoop-aws folder.
  copy_jar_into_m2 4 hadoop-aws-2.7.6.jar \
    "$m2_repo/org/apache/hadoop/hadoop-aws/2.7.6"
fi

echo "Done compiling and setting up."
echo "To get MRI warc to wet for a particular crawl timestamp, cd into cc-index-table and RUN:"
# The per-crawl script was copied into cc-index-table/src/script above, so
# that is the path to give relative to cc-index-table.
echo "./src/script/get_maori_WET_records_for_crawl.sh CC-MAIN-<YYYY-##>"
echo " where YYYY is the year and ## is the common crawl number for that year."