Changeset 30913
Timestamp: 2016-10-25T10:02:58+13:00
Location:  other-projects/hathitrust/vagrant-spark-hdfs-cluster
Files:     3 edited, 1 moved
Legend: in the diffs below, lines prefixed with '-' were removed, lines prefixed with '+' were added, and unprefixed lines are unchanged context.
other-projects/hathitrust/vagrant-spark-hdfs-cluster/trunk/README.txt
Changes from r30905 to r30913:

-Vargrant provisioning files to spin up a modest (4 node) Hadoop
-cluster for experiments processing HTRC Extracted Feature JSON files
-suitable for ingesting into Solr.
-
-Top-level code Apache Spark, processing HDFS stored JSON files, hence
-the need for an underlying Hadoop cluster.
-
-Provisioning based on the following online resources, but updated to
-use newer versions of Ubuntu, Java, and Hadoop.
+Vagrant provisioning files to spin up a modest Spark cluster (master
++ 3 slaves + backup) for experiments processing HTRC Extracted Feature
+JSON files suitable for ingesting into Solr.
+
+To aid parallelism, the code is designed to read JSON files from HDFS,
+so provisioning the cluster includes Hadoop core in addition to Spark.
+
+Provisioning uses Puppet scripting, based on the following online
+resources, but updated to use newer versions of Ubuntu, Java,
+and Hadoop. Spark is then added on top of that.
 
 http://cscarioni.blogspot.co.nz/2012/09/setting-up-hadoop-virtual-cluster-with.html
 
 https://github.com/calo81/vagrant-hadoop-cluster
+
+To get everything set up, type:
+
+  vagrant up
+
+Then log in to the master node and switch to the 'ubuntu' user:
+
+  vagrant ssh master
+  sudo su - ubuntu
+
+If this is the first time, you need to format an HDFS area to use:
+
+  hdfs namenode -format
+
+Otherwise, start up the HDFS and Spark daemon processes:
+
+  start-dfs.sh
+  spark-start-all.sh
+
+You can visit the Spark cluster monitoring page at:
+
+  http://10.10.0.52:8080/

…

 http://kvz.io/blog/2013/01/16/vagrant-tip-keep-virtualbox-guest-additions-in-sync/
+
+----
+SecondaryNode
+----
+
+http://stackoverflow.com/questions/23581425/hadoop-how-to-start-secondary-namenode-on-other-node
+
+<property>
+  <name>dfs.namenode.secondary.http-address</name>
+  <value>ec2-54-187-222-213.us-west-2.compute.amazonaws.com:50090</value>
+</property>
+
+----
+Spark Cluster
+----
+
+http://spark.apache.org/docs/latest/spark-standalone.html
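As a quick smoke test once the daemons are up (a sketch, not part of this changeset), one can stage a sample Extracted Features JSON file in HDFS and attach a Spark shell to the standalone master. The spark://10.10.0.52:7077 URL assumes the default standalone service port on the master node; the HDFS path and the ef-sample.json file name are purely illustrative.

  # check that the datanodes have registered with the namenode
  hdfs dfsadmin -report

  # stage a sample JSON file in HDFS (illustrative path and file name)
  hdfs dfs -mkdir -p /user/ubuntu/ef-json
  hdfs dfs -put ef-sample.json /user/ubuntu/ef-json/

  # attach an interactive shell to the standalone master (assumes default port 7077)
  spark-shell --master spark://10.10.0.52:7077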
other-projects/hathitrust/vagrant-spark-hdfs-cluster/trunk/manifests/base-hadoop.pp
Changes from r30903 to r30913:

 $install_dir = "/opt"
 $hadoop_home = "${install_dir}/hadoop"
+$spark_home = "${install_dir}/spark"
+
 $user = "ubuntu"
 $group = "ubuntu"
+
 $hadoop_master = '10.10.0.52'
 $hadoop_backup = '10.10.0.51'

…

 include hadoop
+include spark
 
 file {

…

 }
 
+#package { "subversion" :
+#  ensure => present,
+#  require => [ Exec['apt-get update'], File["/home/${user}"] ]
+#}
+
+#package { "maven" :
+#  ensure => present,
+#  require => [ Exec['apt-get update'], File["/home/${user}"] ]
+#}
+
 file {
   "/home/${user}/.ssh":
     ensure => "directory",
     owner => "${user}",
-    group => "${ user}",
+    group => "${group}",
     mode => 750,
     require => [ Exec['apt-get update'], File["/home/${user}"] ]
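After provisioning completes, the effect of these variables can be checked from the master node. The following is a rough sketch, assuming the hadoop module changes below have created the symlink under ${install_dir} and that the spark module (not shown in this diff) installs under /opt/spark:

  vagrant ssh master
  ls -ld /opt/hadoop* /opt/spark*   # expect hadoop -> hadoop-2.7.3, owned by ubuntu:ubuntu
  ping -c 1 10.10.0.51              # backup node reachable from the master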
other-projects/hathitrust/vagrant-spark-hdfs-cluster/trunk/modules/hadoop/manifests/init.pp
Changes from r30903 to r30913:

 exec { "download_hadoop":
   # Download from nearby mirror, otherwise task can time-out
-  command => "wget -O /tmp/hadoop.tar.gz http://apache.mirror.amaze.com.au/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
-# command => "wget -O /tmp/hadoop.tar.gz http://www-eu.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
+  command => "wget -O /tmp/hadoop-2.7.3.tar.gz http://apache.mirror.amaze.com.au/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
+# command => "wget -O /tmp/hadoop-2.7.3.tar.gz http://www-eu.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
   path => $path,
   user => $user,
   group => $user,
   unless => "ls ${install_dir} | grep hadoop-2.7.3",
   require => Package["openjdk-7-jdk"]
 }
 
 exec { "unpack_hadoop" :
-  command => "tar -zxf /tmp/hadoop.tar.gz -C ${install_dir}",
+  command => "tar -zxf /tmp/hadoop-2.7.3.tar.gz -C ${install_dir}",
   path => $path,
   creates => "${hadoop_home}-2.7.3",
   require => Exec["download_hadoop"]
 }
 
-exec { 'chown':
-  command => "/bin/chown -R ${user}:${group} /opt/hadoop-2.7.3",
+exec { "rename_hadoop" :
+  command => "ln -s ${install_dir}/hadoop-2.7.3 ${install_dir}/hadoop",
+  path => $path,
+  creates => "${hadoop_home}",
+  require => Exec["unpack_hadoop"]
+}
+
+exec { 'chown_hadoop':
+  command => "/bin/chown -R ${user}:${group} ${hadoop_home}-2.7.3",
   path => '/bin',
   user => 'root',
-  require => Exec["unpack_hadoop"]
+  require => Exec["rename_hadoop"]
 }
 
 file {
-  "${hadoop_home}-2.7.3/etc/hadoop/slaves":
+  "${hadoop_home}/etc/hadoop/slaves":
     content => template('hadoop/slaves'),
     mode => 644,
     owner => $user,
     group => $group,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
 file {
-  "${hadoop_home}-2.7.3/etc/hadoop/masters":
+  "${hadoop_home}/etc/hadoop/masters":
     content => template('hadoop/masters'),
     mode => 644,
     owner => $user,
     group => $group,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
 file {
-  "${hadoop_home}-2.7.3/etc/hadoop/core-site.xml":
+  "${hadoop_home}/etc/hadoop/core-site.xml":
     content => template('hadoop/core-site.xml'),
     mode => 644,
     owner => $user,
     group => $group,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
 file {
-  "${hadoop_home}-2.7.3/etc/hadoop/mapred-site.xml":
+  "${hadoop_home}/etc/hadoop/mapred-site.xml":
     content => template('hadoop/mapred-site.xml'),
     mode => 644,
     owner => $user,
     group => $group,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
 file {
-  "${hadoop_home}-2.7.3/etc/hadoop/hdfs-site.xml":
+  "${hadoop_home}/etc/hadoop/hdfs-site.xml":
     content => template('hadoop/hdfs-site.xml'),
     mode => 644,
     owner => $user,
     group => $group,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
 file {
-  "${hadoop_home}-2.7.3/etc/hadoop/hadoop-env.sh":
+  "${hadoop_home}/etc/hadoop/hadoop-env.sh":
     content => template('hadoop/hadoop-env.sh'),
     mode => 644,
     owner => $user,
     group => $group,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
 file {
-  [ "${hadoop_home}-2.7.3/hadoop_store",
-    "${hadoop_home}-2.7.3/hadoop_store/hdfs",
-    "${hadoop_home}-2.7.3/hadoop_store/hdfs/namenode",
-    "${hadoop_home}-2.7.3/hadoop_store/hdfs/datanode"]:
+  [ "${hadoop_home}/hadoop_store",
+    "${hadoop_home}/hadoop_store/hdfs",
+    "${hadoop_home}/hadoop_store/hdfs/namenode",
+    "${hadoop_home}/hadoop_store/hdfs/datanode"]:
     ensure => 'directory',
     owner => "${user}",
     group => "${group}",
     mode => 755,
-    require => Exec["unpack_hadoop"]
+    require => Exec["chown_hadoop"]
 }
 
-file_line { "add_hadoop_home":
+file {
+  "/home/${user}/.bashrc-setup-hadoop":
+    content => template('hadoop/setup-hadoop.bash'),
+    mode => 644,
+    owner => $user,
+    group => $group,
+    require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
+}
+
+file_line { "setup_hadoop_home":
   ensure => present,
   path => "/home/${user}/.bashrc",
-  line => "export HADOOP_HOME=\"${hadoop_home}-2.7.3\"",
+  line => ". .bashrc-setup-hadoop",
   require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
 }
 
-file_line { "add_hadoop_confdir":
-  ensure => present,
-  path => "/home/${user}/.bashrc",
-  line => 'export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"',
-  require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
-}
-
-file_line { "add_hadoop_setup":
-  ensure => present,
-  path => "/home/${user}/.bashrc",
-  line => 'source "$HADOOP_HOME/etc/hadoop/hadoop-env.sh"',
-  require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
-}
-
-file_line { "add_hadoop_path":
-  ensure => present,
-  path => "/home/${user}/.bashrc",
-  line => 'export PATH="$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin"',
-  require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
-}
 
 }