如果主服务器宕机,造成的损失是不可估量的。要保证主服务器不间断服务,就需要对服务器实现冗余。在众多实现服务器冗余的解决方案中,heartbeat为我们提供了廉价的、可伸缩的高可用集群方案。下面我们通过heartbeat+drbd在Linux下创建一个高可用(HA)的集群服务器。
DRBD 是一种块设备,可以被用于高可用(HA)之中。它类似于一个网络RAID-1功能。当你将数据写入本地文件系统时,数据还将会被发送到网络中另一台主机上。以相同的形式记录在一个文件系统中。本地(主节点)与远程主机(备节点)的数据可以保证实时同步。当本地系统出现故障时,远程主机上还会保留有一份相同的数据,可以继续使用。在高可用(HA)中使用DRBD功能,可以代替使用一个共享盘阵。因为数据同时存在于本地主机和远程主机上。切换时,远程主机只要使用它上面的那份备份数据,就可以继续进行服务了。
服务器地址说明:drbd主服务器地址192.168.1.2,主机名master。drbd从属服务器地址192.168.1.3,主机名slave。虚拟ip 192.168.1.10
虚拟操作系统:redhat 企业版5.4。
1 、主服务器配置:
1 、固定ip地址:
[root@master ~]# setup
[root@master ~]# service network restart
2 、修改hosts文件:
[root@master ~]# echo "192.168.1.2 master" >> /etc/hosts
[root@master ~]# echo "192.168.1.3 slave" >> /etc/hosts
3 、编辑yum客户端:
[root@master ~]# mkdir /mnt/cdrom
[root@master ~]# mount /dev/cdrom /mnt/cdrom/
[root@master ~]# vim /etc/yum.repos.d/rhel-debuginfo.repo
编辑的内容:
[rhel-server]
name=Red Hat Enterprise Linux server
baseurl=file:///mnt/cdrom/Server
enabled=1
gpgcheck=1
gpgkey=file:///mnt/cdrom/RPM-GPG-KEY-redhat-release
[rhel-cluster]
name=Red Hat Enterprise Linux cluster
baseurl=file:///mnt/cdrom/Cluster
enabled=1
gpgcheck=1
gpgkey=file:///mnt/cdrom/RPM-GPG-KEY-redhat-release
[rhel-clusterstorage]
name=Red Hat Enterprise Linux clusterstorage
baseurl=file:///mnt/cdrom/ClusterStorage
enabled=1
gpgcheck=1
gpgkey=file:///mnt/cdrom/RPM-GPG-KEY-redhat-release
4 、新建分区:
[root@master ~]# fdisk /dev/sda
The number of cylinders for this disk is set to 1958.
There is nothing wrong with that, but this is larger than 1024,
and could in certain setups cause problems with:
1) software that runs at boot time (e.g., old versions of LILO)
2) booting and partitioning software from other OSs
(e.g., DOS FDISK, OS/2 FDISK)
Command (m for help): n # 增加新分区
Command action
e extended
p primary partition (1-4)
p # 增加主分区
Selected partition 4
First cylinder (328-1958, default 328): # 默认柱面,直接回车
Using default value 328
Last cylinder or +size or +sizeM or +sizeK (328-1958, default 1958): +1G # 大小为1G
Command (m for help): w # 保存退出
The partition table has been altered!
Calling ioctl() to re-read partition table.
WARNING: Re-reading the partition table failed with error 16: Device or resource busy.
The kernel still uses the old table.
The new table will be used at the next reboot.
Syncing disks.
[root@master ~]#partprobe /dev/sda
查看分区:
[root@master ~]#cat /proc/partitions
5 、安装drbd:
[root@master ~]# yum localinstall drbd83-8.3.8-1.el5.centos.i386.rpm kmod-drbd83-8.3.8-1.el5.centos.i686.rpm --nogpgcheck -y
加载DRBD 模块
[root@master ~]# modprobe drbd
查看模块加载
[root@master ~]# lsmod |grep drbd
drbd 228528 0
编辑配置文件
[root@master ~]# vim /etc/drbd.conf
在底行模式下输入“:r /usr/share/doc/drbd83-8.3.8/drbd.conf”,将示例配置文件的内容读入当前文件。
在从属服务器执行相同的操作;
[root@master ~]# cd /etc/drbd.d/
[root@master drbd.d]# cp global_common.conf global_common.conf.bak
[root@master drbd.d]# vim global_common.conf
内容为:
global {
usage-count no;
# minor-count dialog-refresh disable-ip-verification
}
common {
protocol C;
startup {
wfc-timeout 120;
degr-wfc-timeout 120;
}
disk {
on-io-error detach;
fencing resource-only;
}
net {
cram-hmac-alg "sha1";
shared-secret "mydrbdlab";
}
syncer {
rate 100M;
}
}
定义资源:
名字为web.res
编辑
[root@master drbd.d]# vim web.res
resource web {
on master {
device /dev/drbd0;
disk /dev/sda4;
address 192.168.1.2:7898;
meta-disk internal;
}
on slave {
device /dev/drbd0;
disk /dev/sda4;
address 192.168.1.3:7898;
meta-disk internal;
}
}
将此两个文件拷贝到从属服务器中(地址为192.168.1.3)
[root@master drbd.d]# scp global_common.conf 192.168.1.3:/etc/drbd.d/
[root@master drbd.d]# scp web.res 192.168.1.3:/etc/drbd.d/
6 、检测配置文件
[root@master drbd.d]# drbdadm adjust web
drbdsetup 0 show:5: delay-probe-volume 0k => 0k out of range [4..1048576]k.
7 、创建web 的资源
[root@master drbd.d]# drbdadm create-md web
Writing meta data...
initializing activity log
NOT initialized bitmap
New drbd meta data block successfully created.
启动DRBD 服务
[root@master drbd.d]#service drbd start
8 、将一些文件拷贝到slave(192.168.1.3)中:
先拷贝hosts文件:
[root@master ~]# scp /etc/hosts 192.168.1.3:/etc/
The authenticity of host '192.168.1.3 (192.168.1.3)' can't be established.
RSA key fingerprint is d4:f1:06:3b:a0:81:fd:85:65:20:9e:a1:ee:46:a6:8b.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added '192.168.1.3' (RSA) to the list of known hosts.
root@192.168.1.3's password: # 输入slave的管理员密码
拷贝yum客户端:
[root@master ~]# scp /etc/yum.repos.d/rhel-debuginfo.repo 192.168.1.3:/etc/yum.repos.d/
root@192.168.1.3's password: # 输入slave管理员的密码
拷贝drbd安装包:
[root@master ~]# scp *.rpm 192.168.1.3:/root
root@192.168.1.3's password: # 输入slave管理员的密码
2 、从属服务器配置:
1 、新建分区:(注意新建分区大小与主服务器一样)
[root@slave ~]# fdisk /dev/sda
The number of cylinders for this disk is set to 2610.
There is nothing wrong with that, but this is larger than 1024,
and could in certain setups cause problems with:
1) software that runs at boot time (e.g., old versions of LILO)
2) booting and partitioning software from other OSs
(e.g., DOS FDISK, OS/2 FDISK)
Command (m for help): n # 增加新分区
Command action
e extended
p primary partition (1-4)
p # 增加主分区
Selected partition 4
First cylinder (1580-2610, default 1580): # 默认柱面,直接回车
Using default value 1580
Last cylinder or +size or +sizeM or +sizeK (1580-2610, default 2610): +1G # 大小为1G
Command (m for help): w # 保存退出
The partition table has been altered!
Calling ioctl() to re-read partition table.
WARNING: Re-reading the partition table failed with error 16: Device or resource busy.
The kernel still uses the old table.
The new table will be used at the next reboot.
Syncing disks.
[root@slave ~]# partprobe /dev/sda
2 、安装drbd:
[root@slave ~]# mkdir /mnt/cdrom/
[root@slave ~]# mount /dev/cdrom /mnt/cdrom/
[root@slave ~]# yum localinstall drbd83-8.3.8-1.el5.centos.i386.rpm kmod-drbd83-8.3.8-1.el5.centos.i686.rpm --nogpgcheck -y
[root@slave ~]# cp /usr/share/doc/drbd83-8.3.8/drbd.conf /etc/
3 、检测配置文件
[root@slave drbd.d]# drbdadm adjust web
4 、创建web 的资源
[root@slave drbd.d]# drbdadm create-md web
Writing meta data...
initializing activity log
NOT initialized bitmap
New drbd meta data block successfully created.
3 、接下来在两台服务器上同时操作
1 、在主服务器和从属服务器上同时启动drbd服务:
主服务器 :[root@master drbd.d]# service drbd start
从属服务器 :[root@slave drbd.d]# service drbd start
2 、查看两台服务器上的drbd状态:
master :
[root@master ~]# drbd-overview
0:web Connected Secondary/Secondary Inconsistent/Inconsistent C r----
[root@master drbd.d]# cat /proc/drbd
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by mockbuild@builder10.centos.org, 2010-06-04 08:04:16
0: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:987928
slave
[root@slave ~]# drbd-overview
0:web Connected Secondary/Secondary Inconsistent/Inconsistent C r----
[root@slave drbd.d]# cat /proc/drbd
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by mockbuild@builder10.centos.org, 2010-06-04 08:04:16
0: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:987928
可知两台服务器都为Secondary状态,且磁盘状态均为Inconsistent,说明尚未指定主节点,数据还没有同步。
创建文件夹
[root@master ~]# mkdir /data
[root@slave ~]# mkdir /data
3 、在master上操作:
[root@master ~]# cd /etc/drbd.d/
[root@master drbd.d]# drbdadm -- --overwrite-data-of-peer primary web
[root@master drbd.d]# drbd-overview
0:web SyncSource Primary/Secondary UpToDate/Inconsistent C r----
[====>...............] sync'ed: 26.5% (732280/987928)K delay_probe: 25
格式化:
[root@master drbd.d]# mkfs -t ext3 -L drbdweb /dev/drbd0
挂载:
[root@master drbd.d]# mkdir /mnt/1
[root@master drbd.d]# mount /dev/drbd0 /mnt/1
[root@master drbd.d]# df -h
Filesystem Size Used Avail Use% Mounted on
/dev/sda2 9.7G 2.6G 6.7G 28% /
/dev/sda1 99M 12M 83M 12% /boot
tmpfs 97M 0 97M 0% /dev/shm
/dev/hdc 2.8G 2.8G 0 100% /mnt/cdrom
/dev/drbd0 950M 18M 885M 2% /mnt/1
再次查看两台服务器的状态:
master :
[root@master drbd.d]# service drbd status
drbd driver loaded OK; device status:
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by mockbuild@builder10.centos.org, 2010-06-04 08:04:16
m:res cs ro ds p mounted fstype
0:web Connected Primary/Secondary UpToDate/UpToDate C /mnt/1 ext3
slave
[root@slave drbd.d]# service drbd status
drbd driver loaded OK; device status:
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by mockbuild@builder10.centos.org, 2010-06-04 08:04:16
m:res cs ro ds p mounted fstype
0:web Connected Secondary/Primary UpToDate/UpToDate C
可知master为主服务器,slave为从属服务器,此时已经同步。
4 、NFS配置:
两台服务器都修改nfs 配置文件如下:
[root@master drbd.d]# vim /etc/exports
/data *(rw,sync,insecure,no_root_squash,no_wdelay)
两台服务器都启动服务并设为开机自启动:
service portmap start && chkconfig portmap on
service nfs start && chkconfig nfs on
两台服务器都修改nfs 启动脚本:将/etc/init.d/nfs 脚本中stop 部分的“killproc nfsd -2”修改为“killproc nfsd -9”。
5 、Heartbeat配置
在两台服务器上都操作:
yum localinstall heartbeat-2.1.4-9.el5.i386.rpm heartbeat-pils-2.1.4-10.el5.i386.rpm heartbeat-stonith-2.1.4-10.el5.i386.rpm libnet-1.1.4-3.el5.i386.rpm perl-MailTools-1.77-1.el5.noarch.rpm --nogpgcheck
拷贝配置文档:
主服务器
[root@master ~]# cd /etc/ha.d/
[root@master ha.d]# cp /usr/share/doc/heartbeat-2.1.4/ha.cf ./
[root@master ha.d]# cp /usr/share/doc/heartbeat-2.1.4/haresources ./
[root@master ha.d]# cp /usr/share/doc/heartbeat-2.1.4/authkeys ./
[root@master ha.d]# vim ha.cf
需要打开或修改以下几行:
24 行 debugfile /var/log/ha-debug
29 行 logfile /var/log/ha-log
34 行 logfacility local0
48 行 keepalive 2
56 行 deadtime 10
76 行 udpport 694
121 行 ucast eth0 192.168.1.3 #修改为对方的地址。
220 行 ping 192.168.1.1
157 行 auto_failback off #设为关闭状态。
在212行下面添加以下两行:
node master
node slave
在另一台服务器上修改的ha.cf中只有121行不一样。改为对方的地址192.168.1.2
配置haresources,2台机子相同:
echo "master IPaddr::192.168.1.10/24/eth0 drbddisk::web Filesystem::/dev/drbd0::/data::ext3 killnfsd" >> /etc/ha.d/haresources
authkeys 配置相同:
auth 1
1 crc
#2 sha1 HI!
#3 md5 Hello!
在/etc/ha.d/resource.d目录下创建文件killnfsd并编辑(在两台服务器编辑一样)
echo "killall -9 nfsd;/etc/init.d/nfs restart;exit 0" >> /etc/ha.d/resource.d/killnfsd
设置文档权限:
chmod 600 /etc/ha.d/authkeys
chmod 755 /etc/ha.d/resource.d/killnfsd
开启Heartbeat服务
master :
[root@master ha.d]# service heartbeat start
slave :
[root@slave ha.d]# service heartbeat start
6 、测试
先查看在heartbeat服务运行的情况下drbd的状态:
master :
[root@master ~]# drbd-overview
0:web Connected Primary/Secondary UpToDate/UpToDate C r---- /data ext3 950M 18M 885M 2%
slave :
[root@slave ha.d]# drbd-overview
0:web Connected Secondary/Primary UpToDate/UpToDate C r----
可知master为主服务器,slave为备份服务器。
把master上的heartbeat服务停止:
[root@master ~]# service heartbeat stop
再查看drbd的状态:
master :
[root@master ~]# drbd-overview
0:web Connected Secondary/Primary UpToDate/UpToDate C r----
slave :
[root@slave ha.d]# drbd-overview
0:web Connected Primary/Secondary UpToDate/UpToDate C r---- /data ext3 950M 18M 885M 2%
可知master为备份服务器,slave为主服务器。
至此,slave 接管服务成功,实验已实现所需的功能。