Timestamp:
25.10.2016 10:02:58
Author:
davidb
Message:

Renaming to better represent what the cluster is designed for

Location:
other-projects/hathitrust/vagrant-spark-hdfs-cluster
Files:
3 modified
1 moved

  • other-projects/hathitrust/vagrant-spark-hdfs-cluster/trunk/README.txt

    r30905 → r30913

    -Vagrant provisioning files to spin up a modest (4 node) Hadoop
    -cluster for experiments processing HTRC Extracted Feature JSON files
    -suitable for ingesting into Solr.
    +Vagrant provisioning files to spin up a modest Spark cluster (master
    ++ 3 slaves + backup) for experiments processing HTRC Extracted Feature
    +JSON files suitable for ingesting into Solr.
    +
    +To aid parallelism, the code is designed to read JSON files from HDFS,
    +so provisioning the cluster includes Hadoop core in addition to Spark.
     
     
    -Top-level code Apache Spark, processing HDFS stored JSON files, hence
    -the need for an underlying Hadoop cluster.
    +Provisioning uses Puppet scripting, based on the following on-line
    +resources, but updated to use newer versions of Ubuntu, Java, and
    +Hadoop. Spark is then added on top of that.
     
    -Provisioning based on the following online resources, but updated to
    -use newer versions of Ubuntu, Java, and Hadoop.
     
       http://cscarioni.blogspot.co.nz/2012/09/setting-up-hadoop-virtual-cluster-with.html
     
       https://github.com/calo81/vagrant-hadoop-cluster
    +
    +To get everything set up, type:
    +
    +  vagrant up
    +
    +Then log in to the master node and switch to the 'ubuntu' user:
    +
    +  vagrant ssh master
    +  sudo su - ubuntu
    +
    +If this is the first time, you need to format an HDFS area to use:
    +
    +  hdfs namenode -format
    +
    +Otherwise, start up the HDFS and Spark daemon processes:
    +
    +  start-dfs.sh
    +  spark-start-all.sh
    +
    +You can visit the Spark cluster monitoring page at:
    +
    +  http://10.10.0.52:8080/
     
     
    ...

     http://kvz.io/blog/2013/01/16/vagrant-tip-keep-virtualbox-guest-additions-in-sync/
     
    +----
    +SecondaryNode
    +----
    +
    +http://stackoverflow.com/questions/23581425/hadoop-how-to-start-secondary-namenode-on-other-node
    +
    +<property>
    +  <name>dfs.namenode.secondary.http-address</name>
    +  <value>ec2-54-187-222-213.us-west-2.compute.amazonaws.com:50090</value>
    +</property>
    +
    +----
    +Spark Cluster
    +----
    +
    +http://spark.apache.org/docs/latest/spark-standalone.html
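    For reference, the spark-standalone.html page added above drives the same daemons through Spark's stock sbin scripts; the spark-start-all.sh mentioned in the README is assumed to be a local wrapper around something like the following sketch (worker hostnames are placeholders, not values taken from this changeset):

      # Minimal sketch of starting a Spark standalone cluster with the stock
      # scripts (assumes SPARK_HOME is set; spark-start-all.sh is presumed to
      # wrap these).

      # On the master (10.10.0.52 in this cluster); the web UI defaults to port 8080:
      "$SPARK_HOME/sbin/start-master.sh"

      # Workers are listed one per line in $SPARK_HOME/conf/slaves,
      # e.g. (placeholder hostnames):
      #   slave1
      #   slave2
      #   slave3

      # Start every worker listed in conf/slaves, from the master:
      "$SPARK_HOME/sbin/start-slaves.sh"

      # Or bring up master and workers together:
      "$SPARK_HOME/sbin/start-all.sh"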
  • other-projects/hathitrust/vagrant-spark-hdfs-cluster/trunk/manifests/base-hadoop.pp

    r30903 → r30913

     $install_dir = "/opt"
     $hadoop_home = "${install_dir}/hadoop"
    +$spark_home  = "${install_dir}/spark"
    +
     $user = "ubuntu"
     $group = "ubuntu"
    +
     $hadoop_master = '10.10.0.52'
     $hadoop_backup = '10.10.0.51'

    ...

     
     include hadoop
    +include spark
     
     file { 

    ...

     }
     
    +#package { "subversion" :
    +#   ensure => present,
    +#  require => [ Exec['apt-get update'], File["/home/${user}"] ]
    +#}
    +
    +#package { "maven" :
    +#   ensure => present,
    +#  require => [ Exec['apt-get update'], File["/home/${user}"] ]
    +#}
    +
     file { 
       "/home/${user}/.ssh":
       ensure => "directory",
       owner  => "${user}",
    -  group  => "${user}",
    +  group  => "${group}",
       mode   => 750,
       require => [ Exec['apt-get update'], File["/home/${user}"] ]
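    The new $spark_home variable and 'include spark' refer to a modules/spark manifest that is not part of this changeset; by analogy with the hadoop module below, it presumably unpacks Spark under /opt. A rough sanity check of the layout these manifests aim to produce might look like the sketch below (the /opt/spark entry is an assumption):

      # Sketch: check the provisioned layout on the master node.
      # /opt/hadoop is the symlink created by the rename_hadoop exec below;
      # /opt/spark is assumed from $spark_home (spark module not shown here).
      vagrant ssh master
      sudo su - ubuntu

      ls -ld /opt/hadoop /opt/hadoop-2.7.3   # symlink -> versioned install dir
      ls -ld /opt/spark                      # assumed Spark install location
      ls -ld ~/.ssh                          # mode 750, owned by ubuntu:ubuntu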
  • other-projects/hathitrust/vagrant-spark-hdfs-cluster/trunk/modules/hadoop/manifests/init.pp

    r30903 → r30913

     
     exec { "download_hadoop":
    -# Download from nearby mirror, otherwise task can time-out
    -command => "wget -O /tmp/hadoop.tar.gz http://apache.mirror.amaze.com.au/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
    -# command => "wget -O /tmp/hadoop.tar.gz http://www-eu.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
    -path => $path,
    -user => $user,
    -group => $user,
    -unless => "ls ${install_dir} | grep hadoop-2.7.3",
    -require => Package["openjdk-7-jdk"]
    -}
    +  # Download from nearby mirror, otherwise task can time-out
    +  command => "wget -O /tmp/hadoop-2.7.3.tar.gz http://apache.mirror.amaze.com.au/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
    +  # command => "wget -O /tmp/hadoop-2.7.3.tar.gz http://www-eu.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz",
    +  path => $path,
    +  user => $user,
    +  group => $user,
    +  unless => "ls ${install_dir} | grep hadoop-2.7.3",
    +  require => Package["openjdk-7-jdk"]
    + }
     
     exec { "unpack_hadoop" :
    -  command => "tar -zxf /tmp/hadoop.tar.gz -C ${install_dir}",
    +  command => "tar -zxf /tmp/hadoop-2.7.3.tar.gz -C ${install_dir}",
       path => $path,
       creates => "${hadoop_home}-2.7.3",
       require => Exec["download_hadoop"]
     }
     
    -exec { 'chown':
    -  command => "/bin/chown -R ${user}:${group} /opt/hadoop-2.7.3",
    +exec { "rename_hadoop" :
    +  command => "ln -s ${install_dir}/hadoop-2.7.3 ${install_dir}/hadoop",
    +  path => $path,
    +  creates => "${hadoop_home}",
    +  require => Exec["unpack_hadoop"]
    + }
    +
    +exec { 'chown_hadoop':
    +  command => "/bin/chown -R ${user}:${group} ${hadoop_home}-2.7.3",
       path => '/bin',
       user => 'root',
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["rename_hadoop"]
     }
     
     file { 
    -  "${hadoop_home}-2.7.3/etc/hadoop/slaves":
    +  "${hadoop_home}/etc/hadoop/slaves":
       content => template('hadoop/slaves'),
       mode => 644,
       owner => $user,
       group => $group,
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["chown_hadoop"]
     }
     
     file { 
    -  "${hadoop_home}-2.7.3/etc/hadoop/masters":
    +  "${hadoop_home}/etc/hadoop/masters":
       content => template('hadoop/masters'),
       mode => 644,
       owner => $user,
       group => $group,
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["chown_hadoop"]
     }
     
     file {
    -  "${hadoop_home}-2.7.3/etc/hadoop/core-site.xml":
    +  "${hadoop_home}/etc/hadoop/core-site.xml":
       content => template('hadoop/core-site.xml'),
       mode => 644,
       owner => $user,
       group => $group,
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["chown_hadoop"]
     }
     
     file {
    -  "${hadoop_home}-2.7.3/etc/hadoop/mapred-site.xml":
    +  "${hadoop_home}/etc/hadoop/mapred-site.xml":
       content => template('hadoop/mapred-site.xml'),
       mode => 644,
       owner => $user,
       group => $group,
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["chown_hadoop"]
     }
     
     file {
    -  "${hadoop_home}-2.7.3/etc/hadoop/hdfs-site.xml":
    +  "${hadoop_home}/etc/hadoop/hdfs-site.xml":
       content => template('hadoop/hdfs-site.xml'),
       mode => 644,
       owner => $user,
       group => $group,
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["chown_hadoop"]
     }
     
     file {
    -  "${hadoop_home}-2.7.3/etc/hadoop/hadoop-env.sh":
    +  "${hadoop_home}/etc/hadoop/hadoop-env.sh":
       content => template('hadoop/hadoop-env.sh'),
       mode => 644,
       owner => $user,
       group => $group,
    -  require => Exec["unpack_hadoop"]
    +  require => Exec["chown_hadoop"]
     }
     
     file {
    -   [ "${hadoop_home}-2.7.3/hadoop_store",
    -     "${hadoop_home}-2.7.3/hadoop_store/hdfs",
    -     "${hadoop_home}-2.7.3/hadoop_store/hdfs/namenode",
    -     "${hadoop_home}-2.7.3/hadoop_store/hdfs/datanode"]:
    +   [ "${hadoop_home}/hadoop_store",
    +     "${hadoop_home}/hadoop_store/hdfs",
    +     "${hadoop_home}/hadoop_store/hdfs/namenode",
    +     "${hadoop_home}/hadoop_store/hdfs/datanode"]:
        ensure => 'directory',
        owner => "${user}",
        group => "${group}",
        mode => 755,
    -   require => Exec["unpack_hadoop"]
    +   require => Exec["chown_hadoop"]
      }
     
    -file_line { "add_hadoop_home":
    +file {
    +  "/home/${user}/.bashrc-setup-hadoop":
    +  content => template('hadoop/setup-hadoop.bash'),
    +  mode => 644,
    +  owner => $user,
    +  group => $group,
    +  require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
    + }
    +
    +file_line { "setup_hadoop_home":
         ensure => present,
         path => "/home/${user}/.bashrc",
    -    line => "export HADOOP_HOME=\"${hadoop_home}-2.7.3\"",
    +    line => ". .bashrc-setup-hadoop",
         require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
     }
     
    -file_line { "add_hadoop_confdir":
    -    ensure => present,
    -    path => "/home/${user}/.bashrc",
    -    line => 'export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"',
    -    require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
    -}
     
    -file_line { "add_hadoop_setup":
    -    ensure => present,
    -    path => "/home/${user}/.bashrc",
    -    line => 'source "$HADOOP_HOME/etc/hadoop/hadoop-env.sh"',
    -    require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
    -}
    -
    -file_line { "add_hadoop_path":
    -    ensure => present,
    -    path => "/home/${user}/.bashrc",
    -    line => 'export PATH="$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin"',
    -    require => [ Exec["unpack_hadoop"], File["/home/${user}"] ]
    -}
     
     }
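    The hadoop/setup-hadoop.bash template referenced above is not included in this changeset; a minimal sketch of what it presumably contains, based on the four .bashrc lines that the removed file_line resources used to add (with HADOOP_HOME assumed to point at the /opt/hadoop symlink created by rename_hadoop):

      # Assumed contents of templates/hadoop/setup-hadoop.bash (not part of
      # this changeset); sourced from ~/.bashrc via the "setup_hadoop_home"
      # file_line resource above.
      export HADOOP_HOME="/opt/hadoop"          # symlink created by rename_hadoop
      export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
      source "$HADOOP_HOME/etc/hadoop/hadoop-env.sh"
      export PATH="$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin"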