# INSTALL as of 04/15/2011
# Single-node setup (BEGIN)
# debian network setup, hosts weirdness
# in /etc/hosts
# (begin)
127.0.0.1 localhost.localdomain localhost
[ip0] hadoop0
[ip1] hadoop1
# (end)
apt-get install default-jre
export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk
export PATH=$JAVA_HOME/bin:$PATH
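# quick sanity check (not in the original notes): confirm the exported JDK is the one being picked up
java -version
echo $JAVA_HOME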
apt-get install eclipse
apt-get install subversion
mkdir src
cd src
svn co http://svn.apache.org/repos/asf/forrest/trunk forrest
cd forrest/main/
./build.sh
cd ../..
svn co http://svn.apache.org/repos/asf/hadoop/common/trunk common
svn co http://svn.apache.org/repos/asf/hadoop/hdfs/trunk hdfs
svn co http://svn.apache.org/repos/asf/hadoop/mapreduce/trunk mapreduce
# make sure your DISPLAY variable is set to a working X host
# export DISPLAY=[display host]:0
cd common/
ant
ant -Dforrest.home=[root install dir]/src/forrest tar
cd ..
cd hdfs
ant
ant -Dforrest.home=[root install dir]/src/forrest tar
cd ..
cd mapreduce/
ant
ant -Dforrest.home=[root install dir]/src/forrest tar
cd ..
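# example only (the path is an assumption): if [root install dir] is /usr/src/hadoop, each forrest build above becomes
# ant -Dforrest.home=/usr/src/hadoop/src/forrest tar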
# CONFIGURE to run
# - add to system profile ex. /etc/profile
# (begin)
export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk/
export PATH=$JAVA_HOME/bin:$PATH
export HADOOP_HOME=[hadoop root dir] # for example /usr/src/hadoop
export HADOOP_COMMON_HOME=$HADOOP_HOME/common/
export HADOOP_HDFS_HOME=$HADOOP_HOME/hdfs/
export HADOOP_MAPREDUCE_HOME=$HADOOP_HOME/mapreduce/
export HADOOP_CONF_DIR=$HADOOP_COMMON_HOME/conf/
# (end)
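# quick check (not in the original notes): re-read the profile and confirm the variables resolve
. /etc/profile
echo $HADOOP_COMMON_HOME $HADOOP_CONF_DIR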
# create a hadoop user to run everything
adduser hadoop
# make hadoop user own all hadoop tree
cd [hadoop root dir/..] # for example cd /usr/src/
chown -R hadoop hadoop
# NOTE: edited $HADOOP_COMMON_HOME/bin/hadoop-config.sh
# added
# export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk/
# before it does its JAVA_HOME checks
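# quick check: with JAVA_HOME fixed up in hadoop-config.sh, this should report the build version without complaining about java
$HADOOP_COMMON_HOME/bin/hadoop version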
# SSH SETUP for root and hadoop user
# localhost
ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
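# verify password-less login (should not prompt for a password); repeat for both root and the hadoop user
ssh localhost true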
# setup/test a SINGLE NODE
# (begin)
# fix the $HADOOP_HDFS_HOME/bin/hdfs shell file
# (begin)
# look for this
# HADOOP_OPTS="$HADOOP_OPTS -jvm server $HADOOP_DATANODE_OPTS"
# change to this
# HADOOP_OPTS="$HADOOP_OPTS -server $HADOOP_DATANODE_OPTS"
# (end)
mkdir -p /local/hdfs/tmp
# $HADOOP_COMMON_HOME/conf/core-site.xml modifications should result in a file that looks like this
# (begin)
<?xml version="1.0"?>
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/local/hdfs/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>dfs.data.dir</name>
    <value>/local/hdfs/data</value>
  </property>
  <property>
    <name>dfs.name.dir</name>
    <value>/local/hdfs/name</value>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
# (end)
# format hdfs
$HADOOP_COMMON_HOME/bin/hadoop namenode -format
# RUN IT
$HADOOP_COMMON_HOME/bin/start-all.sh
# should be no errors
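# quick check (not in the original notes): jps from the JDK should list the daemons, e.g. NameNode, DataNode, SecondaryNameNode, JobTracker, TaskTracker
jps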
# (end)
# Single-node setup (END)
# Master-node setup (BEGIN)
# back up the conf directories from common, hdfs, and mapreduce
# edit /etc/hosts to include the cluster members (should have static ip settings)
# ex: add only entries similar to the following; do not change the entries created by the system config, i.e. the localhost lines
[ip0] hadoop0
[ip1] hadoop1
# exchange the ssh keys between machines enabling password-less access, including hostX<->hostX, hostY<->hostY, hostX<->hostY
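# one way to do the exchange (a sketch, assuming ssh-copy-id is installed; repeat as root and as hadoop, on every host, for every host)
ssh-copy-id hadoop@hadoop0
ssh-copy-id hadoop@hadoop1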
# on master machine, update common/conf/masters, changing from localhost to name of master
# ex:
hadoop0
# on master machine, update common/conf/slaves, changing from localhost to name of master and all slaves
# ex:
hadoop0
hadoop1
# on all cluster member machines, update core-site.xml, mapred-site.xml, and hdfs-site.xml (find these files in the appropriate conf directories)
# common/conf/core-site.xml
# look for variable, fs.default.name, change value from localhost to master name
# ex: look for this section
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000</value>
  </property>
# ex: change to
  <property>
    <name>fs.default.name</name>
    <value>hdfs://hadoop0:9000</value>
  </property>
# mapreduce/conf/mapred-site.xml
# look for variable, mapred.job.tracker, change value from localhost to master name
# ex: look for this section
  <property>
    <name>mapred.job.tracker</name>
    <value>localhost:9001</value>
  </property>
# ex: change to
  <property>
    <name>mapred.job.tracker</name>
    <value>hadoop0:9001</value>
  </property>
# hdfs/conf/hdfs-site.xml
# look for variable, dfs.replication, change to number of nodes in cluster
# ex: look for this section
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
# ex: change to
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
# start up dfs on the master; it should start the daemons on the slaves as well
# NOTE: a problem was found where the master was not starting the slaves; this was caused by the install locations on the master and slaves being different.
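# the start commands (a sketch; assumes start-dfs.sh and start-mapred.sh sit next to the start-all.sh used in the single-node setup)
$HADOOP_COMMON_HOME/bin/start-dfs.sh
$HADOOP_COMMON_HOME/bin/start-mapred.sh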
# follow-up configuration changes, some performance mods, especially for hbase
# modify hbase-env.sh in the [hbase_root]/conf directory
# export JAVA_HOME to the same java setting used in the hadoop config
# edit /etc/security/limits.conf
# add the following lines
hadoop - nofile 32768
hadoop soft nproc 32768
hadoop hard nproc 32768
# edit /etc/pam.d/common-session
# add the following line
session required pam_limits.so
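# verify the limits took effect (not in the original notes); a fresh login session as hadoop should show the new values
su - hadoop -c 'ulimit -n; ulimit -u'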
# 05/12/2011 - change to the user running the system
# in hdfs/conf/hdfs-site.xml, add user hadoop belonging to group root; this changes the webuser/webgroup default setting described below
# hadoop should be a normal user added to the system
# the hadoop user should own /usr/src/hadoop and wherever you are running the built binaries from
# the hadoop user should own /local/hdfs or whatever local file system path hadoop is using
# run the format command as the hadoop user (see the command sketch below)
# restart everything; any permission errors in the logs during format or startup imply the hadoop user doesn't own the offending file
  <property>
    <name>dfs.web.ugi</name>
    <value>hadoop,root</value>
  </property>
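# the ownership/format steps above as commands (a sketch; /usr/src/hadoop and /local/hdfs are the example paths used in this document)
chown -R hadoop /usr/src/hadoop
chown -R hadoop /local/hdfs
su - hadoop -c '$HADOOP_COMMON_HOME/bin/hadoop namenode -format'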
# hadoop security is not fully understood yet and is causing problems in clustered sub-apps
# hence, we disable at least hdfs security, which is looking for a 'webuser' belonging to 'webgroup', and that happens to be hardcoded in the system. ARGH!
# edit $HADOOP_HDFS_HOME/conf/hdfs-site.xml, add
# (begin)
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>
# (end)
# hbase specific configuration
# edit $HBASE_ROOT_DIR/conf/hbase-site.xml
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://hadoop0:9000/hbase</value>
    <description>The directory shared by region servers.</description>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    <description>The mode the cluster will be in. Possible values are
      false: standalone and pseudo-distributed setups with managed Zookeeper
      true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
    </description>
  </property>
# edit $HBASE_ROOT_DIR/conf/regionservers, similar to slaves file
hadoop0
hadoop1
# edit $HADOOP_COMMON_HOME/conf/hadoop-env.sh
export HADOOP_CLASSPATH=$HBASE_HOME/build/hbase-X.X.X.jar:$HBASE_HOME/build/hbase-X.X.X-test.jar:$HBASE_HOME/conf:${HBASE_HOME}/lib/zookeeper-X.X.X.jar
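# alternative (an assumption, not from the original notes): let the shell fill in the jar versions, assuming build/ and lib/ contain only the intended hbase and zookeeper jars
# export HADOOP_CLASSPATH=$(echo $HBASE_HOME/build/hbase-*.jar $HBASE_HOME/lib/zookeeper-*.jar | tr ' ' ':'):$HBASE_HOME/conf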
# mahout
# could not build version 3 or version 2 of mahout
# downloaded the version 3 binary and installed it according to the instructions
# mahout configuration
# add to /etc/profile like all other environment settings in this readme
export MAHOUT_HOME=/usr/src/mahout
# with the setup described in this document, the mahout example scripts incorrectly use HADOOP_HOME and should instead refer to HADOOP_COMMON_HOME
# pay attention to the actual hdfs file paths and alter the examples accordingly
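# a possible workaround (an assumption, not from the original notes): in the shell used to run the mahout examples, point HADOOP_HOME at common so the unmodified scripts find the hadoop binaries
# export HADOOP_HOME=$HADOOP_COMMON_HOME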