# INSTALL as of 04/15/2011

# Single-node setup (BEGIN)

# debian network setup, hosts weirdness
# in /etc/hosts
# (begin)
127.0.0.1   localhost.localdomain localhost
[ip0]       hadoop0
[ip1]       hadoop1
# (end)

apt-get install default-jre
export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk
export PATH=$JAVA_HOME/bin:$PATH

apt-get install eclipse
apt-get install subversion

mkdir src
cd src
svn co http://svn.apache.org/repos/asf/forrest/trunk forrest
cd forrest/main/
./build.sh
cd ../..
svn co http://svn.apache.org/repos/asf/hadoop/common/trunk common
svn co http://svn.apache.org/repos/asf/hadoop/hdfs/trunk hdfs
svn co http://svn.apache.org/repos/asf/hadoop/mapreduce/trunk mapreduce

# make sure your DISPLAY variable is set to a working X host
# export DISPLAY=[display host]:0
cd common/
ant
ant -Dforrest.home=[root install dir]/src/forrest tar
cd ..
cd hdfs
ant
ant -Dforrest.home=[root install dir]/src/forrest tar
cd ..
cd mapreduce/
ant
ant -Dforrest.home=[root install dir]/src/forrest tar
cd ..

# CONFIGURE to run
# - add to the system profile, ex. /etc/profile
# (begin)
export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk/
export PATH=$JAVA_HOME/bin:$PATH
export HADOOP_HOME=[hadoop root dir]   # for example /usr/src/hadoop
export HADOOP_COMMON_HOME=$HADOOP_HOME/common/
export HADOOP_HDFS_HOME=$HADOOP_HOME/hdfs/
export HADOOP_MAPREDUCE_HOME=$HADOOP_HOME/mapreduce/
export HADOOP_CONF_DIR=$HADOOP_COMMON_HOME/conf/
# (end)

# create a hadoop user to run everything
adduser hadoop
# make the hadoop user own the whole hadoop tree
cd [hadoop root dir/..]   # for example cd /usr/src/
chown -R hadoop hadoop

# NOTE: edited $HADOOP_COMMON_HOME/bin/hadoop-config.sh
# added
#   export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk/
# before it does the JAVA_HOME checks

# SSH SETUP for root and the hadoop user
# localhost
ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys

# setup/test a SINGLE NODE
# (begin)
# fix the $HADOOP_HDFS_HOME/bin/hdfs shell file
# (begin)
# look for this
#   HADOOP_OPTS="$HADOOP_OPTS -jvm server $HADOOP_DATANODE_OPTS"
# change to this
#   HADOOP_OPTS="$HADOOP_OPTS -server $HADOOP_DATANODE_OPTS"
# (end)

mkdir -p /local/hdfs/tmp

# $HADOOP_COMMON_HOME/conf/core-site.xml modifications should result in a file that looks like this
# (begin)
<?xml version="1.0"?>
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/local/hdfs/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>dfs.data.dir</name>
    <value>/local/hdfs/data</value>
  </property>
  <property>
    <name>dfs.name.dir</name>
    <value>/local/hdfs/name</value>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
# (end)

# format hdfs
$HADOOP_COMMON_HOME/bin/hadoop namenode -format

# RUN IT
$HADOOP_COMMON_HOME/bin/start-all.sh
# should be no errors
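# Optional smoke test, a minimal sketch assuming the daemons came up cleanly with the paths
# used above. jps ships with the JDK (not the JRE), so install the full JDK if it is missing;
# it should list NameNode, DataNode, SecondaryNameNode, JobTracker and TaskTracker. The fs
# commands confirm HDFS accepts writes; /smoketest is just a throwaway example path.
jps
$HADOOP_COMMON_HOME/bin/hadoop fs -mkdir /smoketest
$HADOOP_COMMON_HOME/bin/hadoop fs -ls /
$HADOOP_COMMON_HOME/bin/hadoop fs -rmr /smoketest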
# (end)
# Single-node setup (END)

# Master-node setup (BEGIN)

# backup the conf directories from common, hdfs, and mapreduce
# edit /etc/hosts to include the cluster members (they should have static ip settings)
# ex: only add entries similar to the following; do not change the entries created by any
# system config, i.e. the localhost stuff
[ip0]   hadoop0
[ip1]   hadoop1

# exchange the ssh keys between machines to enable password-less access, including
# hostX<->hostX, hostY<->hostY, and hostX<->hostY

# on the master machine, update common/conf/masters, changing localhost to the name of the master
# ex: hadoop0

# on the master machine, update common/conf/slaves, changing localhost to the names of the
# master and all slaves
# ex:
#   hadoop0
#   hadoop1

# on all cluster member machines, update core-site.xml, mapred-site.xml, and hdfs-site.xml
# (find these files in the appropriate conf directories)

# common/conf/core-site.xml
# look for the variable fs.default.name and change its value from localhost to the master name
# ex: look for this section
<property>
  <name>fs.default.name</name>
  <value>hdfs://localhost:9000</value>
</property>
# ex: change to
<property>
  <name>fs.default.name</name>
  <value>hdfs://hadoop0:9000</value>
</property>

# mapreduce/conf/mapred-site.xml
# look for the variable mapred.job.tracker and change its value from localhost to the master name
# ex: look for this section
<property>
  <name>mapred.job.tracker</name>
  <value>localhost:9001</value>
</property>
# ex: change to
<property>
  <name>mapred.job.tracker</name>
  <value>hadoop0:9001</value>
</property>

# hdfs/conf/hdfs-site.xml
# look for the variable dfs.replication and change it to the number of nodes in the cluster
# ex: look for this section
<property>
  <name>dfs.replication</name>
  <value>1</value>
</property>
# ex: change to
<property>
  <name>dfs.replication</name>
  <value>2</value>
</property>

# start up dfs on the master; it should start the daemons on the slaves
# NOTE: a problem was found where the master was not starting the slaves; this was caused by
# the install locations on the master and the slaves being different.
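# Optional cluster check from the master, a minimal sketch assuming the hostnames used above
# (hadoop0 as master, hadoop1 as slave) and the environment settings from /etc/profile.
# Depending on the build, dfsadmin may also be available as $HADOOP_HDFS_HOME/bin/hdfs dfsadmin.
# The report should show one live datanode per cluster member.
$HADOOP_COMMON_HOME/bin/hadoop dfsadmin -report
# the password-less ssh set up earlier lets you spot-check the daemons on a slave from the
# master (use the full $JAVA_HOME/bin/jps path if jps is not on the non-interactive PATH)
ssh hadoop1 jps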
# followup configuration changes, some performance mods, especially for hbase
# modify hbase-env.sh in the [hbase_root]/conf directory
#   export JAVA_HOME to the same java setting used in the hadoop config

# edit /etc/security/limits.conf
# add the following lines (open-file and soft/hard nproc limits for the hadoop user)
hadoop  -       nofile  32768
hadoop  soft    nproc   32768
hadoop  hard    nproc   32768

# edit /etc/pam.d/common-session
# add the following line
session required pam_limits.so

# 05/12/2011 - change to the user running the system
# in hdfs/conf/hdfs-site.xml, add user hadoop belonging to group root; this changes the
# webuser/webgroup default setting described below
# hadoop should be a normal user added to the file system
# the hadoop user should own /usr/src/hadoop and wherever you are running built binaries from
# the hadoop user should own /local/hdfs, or whatever local file system is used by hadoop
# run the format command as the hadoop user
# restart everything; any permission errors in the logs during format or startup imply the
# hadoop user doesn't own the offending file
<property>
  <name>dfs.web.ugi</name>
  <value>hadoop,root</value>
</property>

# hadoop security is not fully understood yet and is causing problems in clustered subapps
# hence, we disable at least hdfs security, which looks for 'webuser' belonging to 'webgroup',
# values that happen to be hardcoded in the system ARGGGGGGGGGGGGGGGGG!
# edit $HADOOP_HDFS_HOME/conf/hdfs-site.xml, add
# (begin)
<property>
  <name>dfs.permissions</name>
  <value>false</value>
</property>
# (end)

# hbase specific configuration
# edit $HBASE_ROOT_DIR/conf/hbase-site.xml
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://hadoop0:9000/hbase</value>
  <description>The directory shared by region servers.</description>
</property>
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
  <description>The mode the cluster will be in. Possible values are
    false: standalone and pseudo-distributed setups with managed Zookeeper
    true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
  </description>
</property>

# edit $HBASE_ROOT_DIR/conf/regionservers, similar to the slaves file
hadoop0
hadoop1

# edit $HADOOP_COMMON_HOME/conf/hadoop-env.sh
export HADOOP_CLASSPATH=$HBASE_HOME/build/hbase-X.X.X.jar:$HBASE_HOME/build/hbase-X.X.X-test.jar:$HBASE_HOME/conf:${HBASE_HOME}/lib/zookeeper-X.X.X.jar

# mahout
# could not build version 3 or version 2 of mahout
# downloaded the version 3 binary and installed it according to instructions

# mahout configuration
# add to /etc/profile like all other environment settings in this readme
export MAHOUT_HOME=/usr/src/mahout

# with the setup described in this document, the mahout examples incorrectly use HADOOP_HOME
# in all scripts/files and should instead refer to HADOOP_COMMON_HOME
# pay attention to the actual hdfs file path used and alter the examples accordingly
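# Workaround sketch for the HADOOP_HOME issue noted above; this is an assumption, not part of
# the original instructions. If the mahout example scripts only read HADOOP_HOME from the
# environment, pointing it at HADOOP_COMMON_HOME in the shell that runs them avoids editing
# each script by hand.
export HADOOP_HOME=$HADOOP_COMMON_HOME
# then run the mahout example as documented, adjusting the hdfs paths it expects as noted above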