15 Ceph Advanced Parameter Configuration (Repost)
Adjusting the Namespace
cd cluster/examples/kubernetes/ceph
export ROOK_OPERATOR_NAMESPACE="rook-ceph"
export ROOK_CLUSTER_NAMESPACE="rook-ceph"
sed -i.bak \
    -e "s/\(.*\):.*# namespace:operator/\1: $ROOK_OPERATOR_NAMESPACE # namespace:operator/g" \
    -e "s/\(.*\):.*# namespace:cluster/\1: $ROOK_CLUSTER_NAMESPACE # namespace:cluster/g" \
    -e "s/\(.*serviceaccount\):.*:\(.*\) # serviceaccount:namespace:operator/\1:$ROOK_OPERATOR_NAMESPACE:\2 # serviceaccount:namespace:operator/g" \
    -e "s/\(.*serviceaccount\):.*:\(.*\) # serviceaccount:namespace:cluster/\1:$ROOK_CLUSTER_NAMESPACE:\2 # serviceaccount:namespace:cluster/g" \
    -e "s/\(.*\): [-_A-Za-z0-9]*\.\(.*\) # driver:namespace:operator/\1: $ROOK_OPERATOR_NAMESPACE.\2 # driver:namespace:operator/g" \
    -e "s/\(.*\): [-_A-Za-z0-9]*\.\(.*\) # driver:namespace:cluster/\1: $ROOK_CLUSTER_NAMESPACE.\2 # driver:namespace:cluster/g" \
    common.yaml operator.yaml cluster.yaml # add other files or change these as desired for your config

# You need to use `apply` for all Ceph clusters after the first if you have only one Operator
kubectl apply -f common.yaml -f operator.yaml -f cluster.yaml # add other files as desired for your config
Collecting Container Logs
[root@m1 rbd]# cat /tmp/log.sh
#!/bin/bash
log_path="/var/log/rook"
DATE=`date +%F`
mkdir -pv ${log_path}
for p in $(kubectl -n rook-ceph get pods -o jsonpath='{.items[*].metadata.name}')
do
    for c in $(kubectl -n rook-ceph get pod ${p} -o jsonpath='{.spec.containers[*].name}')
    do
        echo "BEGIN logs from pod: ${p} ${c}"
        kubectl -n rook-ceph logs -c ${c} ${p} | tee ${log_path}/${c}_${p}_${DATE}.logs
        echo "END logs from pod: ${p} ${c}"
    done
done
[root@m1 rbd]# ls /var/log/rook/ -lh
total 11M
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.133-778bbd9bc5-slv77_2022-12-02.logs
-rw-r--r-- 1 root root  329 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.134-55bffbcd86-8db2m_2022-12-02.logs
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.135-568bff4f85-ftdvx_2022-12-02.logs
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.136-55fdc6f5bd-kqtgh_2022-12-02.logs
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.137-9c7cb5f7-svz9z_2022-12-02.logs
-rw-r--r-- 1 root root  517 Dec  2 16:35 csi-attacher_csi-cephfsplugin-provisioner-8658f67749-jxshb_2022-12-02.logs
-rw-r--r-- 1 root root  517 Dec  2 16:34 csi-attacher_csi-cephfsplugin-provisioner-8658f67749-jxshb_.logs
-rw-r--r-- 1 root root  734 Dec  2 16:35 csi-attacher_csi-cephfsplugin-provisioner-8658f67749-whmrx_2022-12-02.logs
-rw-r--r-- 1 root root 1.0K Dec  2 16:35 csi-attacher_csi-rbdplugin-provisioner-94f699d86-bh4fv_2022-12-02.logs
-rw-r--r-- 1 root root  728 Dec  2 16:35 csi-attacher_csi-rbdplugin-provisioner-94f699d86-p6vpm_2022-12-02.logs
......
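If the logs need to be collected regularly, the script can be scheduled through cron. This is only a sketch, assuming the script is saved as /tmp/log.sh as above; the schedule and output path are illustrative.
# hypothetical crontab entry: collect all pod logs daily at 02:00
0 2 * * * /bin/bash /tmp/log.sh >> /var/log/rook/collect_cron.log 2>&1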
Collecting OSD Information
Ceph carries a large amount of configuration that can be read through its admin sockets. For example, ceph --admin-daemon /var/run/ceph/mon-node-1.socket config show returns the configuration of the mon running on m1, and config set / config get can be used to change and read individual options at runtime. Because Rook runs the daemons in containers, we need to exec into the relevant container to reach the socket, as follows:
[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-mon-a-6cfc46ccd8-xrmzn -- bash
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config show | head
{
"name": "mon.a",
"cluster": "ceph",
"admin_socket": "/var/run/ceph/ceph-mon.a.asok",
"admin_socket_mode": "",
"allow_ansi": "Terminal",
"auth_client_required": "cephx, none",
"auth_cluster_required": "cephx",
"auth_debug": "false",
"auth_mon_ticket_ttl": "43200.000000",
Likewise, each OSD runs in the cluster as a container, and we can exec into it to inspect its information:
[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-osd-1-5866f9f558-jq994 -- bash
[root@rook-ceph-osd-1-5866f9f558-jq994 /]# ls -l /var/lib/ceph/osd/ceph-1/
total 28
lrwxrwxrwx 1 ceph ceph 93 Dec 1 05:29 block -> /dev/ceph-4a84daae-a3c8-40a3-822b-100c8e47d8d2/osd-block-da31a738-a678-4aa1-b238-5ce69ba5aaa0
-rw------- 1 ceph ceph 37 Dec 1 05:29 ceph_fsid
-rw------- 1 ceph ceph 37 Dec 1 05:29 fsid
-rw------- 1 ceph ceph 55 Dec 1 05:29 keyring
-rw------- 1 ceph ceph 6 Dec 1 05:29 ready
-rw------- 1 ceph ceph 3 Dec 1 05:29 require_osd_release
-rw------- 1 ceph ceph 10 Dec 1 05:29 type
-rw------- 1 ceph ceph 2 Dec 1 05:29 whoami
[root@rook-ceph-osd-1-5866f9f558-jq994 /]# lsblk -f
NAME FSTYPE LABEL UUID MOUNTPOINT
sda
|-sda1 xfs fef3e1da-62e1-41eb-b1cb-19021cce0cf5
`-sda2 LVM2_member wzFi9m-QD1s-G4Yi-SBN4-7ud9-8wYk-NGx1dS
|-centos-root xfs 9886a31d-694b-4957-ad05-247edc04dd88 /var/lib/ceph/osd/ceph-1
`-centos-swap swap 239e607c-1a07-44d5-ae89-1b415e160ff8
sdb LVM2_member 3EQj3n-Gkb8-aDVN-iTJj-JrpZ-mhjU-Yyph3G
`-ceph--4a84daae--a3c8--40a3--822b--100c8e47d8d2-osd--block--da31a738--a678--4aa1--b238--5ce69ba5aaa0
sr0 iso9660 CentOS 7 x86_64 2020-11-03-14-55-29-00
Rook officially provides a script for viewing OSD information:
# Get OSD Pods
# This uses the example/default cluster name "rook-ceph"
OSD_PODS=$(kubectl get pods --all-namespaces -l \
app=rook-ceph-osd,rook_cluster=rook-ceph -o jsonpath='{.items[*].metadata.name}')
# Find node and drive associations from OSD pods
for pod in $(echo ${OSD_PODS})
do
echo "Pod: ${pod}"
echo "Node: $(kubectl -n rook-ceph get pod ${pod} -o jsonpath='{.spec.nodeName}')"
kubectl -n rook-ceph exec ${pod} -- sh -c '\
for i in /var/lib/ceph/osd/ceph-*; do
[ -f ${i}/ready ] || continue
echo -ne "-$(basename ${i}) "
echo $(lsblk -n -o NAME,SIZE ${i}/block 2> /dev/null || \
findmnt -n -v -o SOURCE,SIZE -T ${i}) $(cat ${i}/type)
done | sort -V
echo'
done
Configuring Pool Parameters
Ceph provides parameters for tuning pools via ceph osd pool set size|pg_num|pgp_num and so on. For example, to adjust the replica count:
[root@m1 rbd]# ceph osd pool get testpool1 size
size: 3
[root@m1 rbd]# ceph osd pool set testpool1 size 2
set pool 13 size to 2
[root@m1 rbd]# ceph osd pool get testpool1 size
size: 2
Besides the replica count, another important pool parameter is the number of PGs. PG is short for placement group; together with PGP it determines how data is distributed, the CRUSH algorithm mapping each PG onto a set of OSDs. Too few PGs per OSD results in uneven data distribution, so a reasonably sized value needs to be chosen. The PG autoscaler is enabled by default, as shown below:
[root@m1 rbd]# ceph osd pool autoscale-status
POOL SIZE TARGET SIZE RATE RAW CAPACITY RATIO TARGET RATIO EFFECTIVE RATIO BIAS PG_NUM NEW PG_NUM AUTOSCALE
device_health_metrics 0 3.0 249.9G 0.0000 1.0 1 on
replicapool 1454M 3.0 249.9G 0.0171 1.0 32 on
myfs-metadata 3252k 3.0 249.9G 0.0000 4.0 32 on
myfs-data0 108.4M 3.0 249.9G 0.0013 1.0 32 on
my-store.rgw.control 0 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.meta 7525 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.log 222.5k 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.buckets.index 830.3k 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.buckets.non-ec 1352 3.0 249.9G 0.0000 1.0 8 on
.rgw.root 3927 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.buckets.data 3314 3.0 249.9G 0.0000 1.0 32 on
evescn_test 900.9k 3.0 249.9G 0.0000 1.0 32 on
testpool1 0 2.0 249.9G 0.0000 1.0 16 on
testpool2 0 2.0 249.9G 0.0000 1.0 16 on
Ceph automatically adjusts the number of PGs based on the amount of data. In production, however, the PG count is usually set manually, so the autoscaler should be disabled:
[root@m1 rbd]# ceph osd pool get testpool1 pg_autoscale_mode
pg_autoscale_mode: on
[root@m1 rbd]# ceph osd pool set testpool1 pg_autoscale_mode off
set pool 13 pg_autoscale_mode to off
[root@m1 rbd]# ceph osd pool get testpool1 pg_autoscale_mode
pg_autoscale_mode: off
After disabling it, the PG and PGP counts need to be set manually:
[root@m1 rbd]# ceph osd pool set testpool1 pg_num 32
set pool 13 pg_num to 32
[root@m1 rbd]# ceph osd pool set testpool1 pgp_num 32
set pool 13 pgp_num to 32
[root@m1 rbd]# ceph osd pool autoscale-status
POOL SIZE TARGET SIZE RATE RAW CAPACITY RATIO TARGET RATIO EFFECTIVE RATIO BIAS PG_NUM NEW PG_NUM AUTOSCALE
device_health_metrics 0 3.0 249.9G 0.0000 1.0 1 on
replicapool 1454M 3.0 249.9G 0.0171 1.0 32 on
myfs-metadata 3252k 3.0 249.9G 0.0000 4.0 32 on
myfs-data0 108.4M 3.0 249.9G 0.0013 1.0 32 on
my-store.rgw.control 0 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.meta 7525 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.log 222.5k 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.buckets.index 830.3k 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.buckets.non-ec 1352 3.0 249.9G 0.0000 1.0 8 on
.rgw.root 3927 3.0 249.9G 0.0000 1.0 8 on
my-store.rgw.buckets.data 3314 3.0 249.9G 0.0000 1.0 32 on
evescn_test 900.9k 3.0 249.9G 0.0000 1.0 32 on
testpool1 0 2.0 249.9G 0.0000 1.0 32 off
testpool2 0 2.0 249.9G 0.0000 1.0 32 on
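How large should pg_num be? A commonly cited rule of thumb, not from this article, is roughly 100 PGs per OSD divided by the replica size and spread across the pools, rounded to a power of two; verify against the official PG calculator for real workloads.
# rough estimate for this 5-OSD, size=3 cluster (illustrative arithmetic only)
echo $(( 5 * 100 / 3 ))    # ~166 PGs in total across all pools
# divided among the pools and rounded to a power of two, e.g. 32 per pool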
Adjusting mon Parameters
There are generally two ways to adjust Ceph parameters:
- Temporary: change a value online with config set; the new value only lasts until the daemon restarts.
- Permanent: edit the ceph.conf configuration file and then restart each affected component process.
Let's start with an example. When a pool is created it is assigned pg_num and pgp_num values, which default to 32:
[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-mon-a-6cfc46ccd8-xrmzn -- bash
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config show | grep pg_num
"mgr_debug_aggressive_pg_num_changes": "false",
"mon_max_pool_pg_num": "65536",
"mon_warn_on_pool_pg_num_not_power_of_two": "true",
"osd_pool_default_pg_num": "32", # 每個 pool 預設的 pg_num 為 32
"rgw_rados_pool_pg_num_min": "8",
A pool created at this point therefore gets a default pg_num and pgp_num of 32, which can be verified as follows:
[root@m1 rbd]# ceph osd pool create pool1
pool 'pool1' created
[root@m1 rbd]# ceph osd pool get pool1 pg_num
pg_num: 32
[root@m1 rbd]# ceph osd pool get pool1 pgp_num
pgp_num: 32
Adjust the parameter, changing its value from 32 to 16:
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config set osd_pool_default_pg_num 16
{
"success": "osd_pool_default_pg_num = '16' (not observed, change may require restart) "
}
Change the same parameter on the other two mons in the same way, then test:
[root@m1 rbd]# ceph osd pool create pool2
pool 'pool2' created
[root@m1 rbd]# ceph osd pool get pool2 pg_num
pg_num: 16
[root@m1 rbd]# ceph osd pool get pool2 pgp_num
pgp_num: 16
Preventing Accidental Pool Deletion
Pool deletion is controlled by the mon_allow_pool_delete switch, which defaults to true, so pools can be deleted by default:
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config show | grep delete
"mon_allow_pool_delete": "true",
So the pool can be removed:
[root@m1 rbd]# ceph osd pool rm pool1 pool1 --yes-i-really-really-mean-it
pool 'pool1' removed
To guard against accidental deletion, turn the switch off; this must be done on all three mon services:
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config set mon_allow_pool_delete false
{
"success": "mon_allow_pool_delete = 'false' "
}
After turning it off, attempting another deletion fails because pool deletion is disabled:
[root@m1 rbd]# ceph osd pool rm pool2 pool2 --yes-i-really-really-mean-it
Error EPERM: pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool
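If a pool genuinely has to be removed, the guard can be lifted at runtime and re-enabled straight afterwards. A hedged sketch using injectargs, which pushes a setting to all mons at once:
ceph tell 'mon.*' injectargs '--mon_allow_pool_delete=true'
ceph osd pool rm pool2 pool2 --yes-i-really-really-mean-it
ceph tell 'mon.*' injectargs '--mon_allow_pool_delete=false'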
Customizing the Ceph Configuration
Changes made with config set are only temporary. To make a setting permanent, ceph.conf has to be modified; in Rook this is done by editing the rook-config-override ConfigMap, for example:
[root@m1 rbd]# cat overide.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: rook-config-override
  namespace: rook-ceph
data:
  config: |
    [global]
    osd crush update on start = false
    osd pool default size = 2
    mon_allow_pool_delete = true
    osd_pool_default_pg_num = 64
Apply it to the cluster:
[root@m1 rbd]# kubectl apply -f overide.yaml
Warning: resource configmaps/rook-config-override is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
configmap/rook-config-override configured
Check the resulting ConfigMap:
[root@m1 rbd]# kubectl get configmaps -n rook-ceph rook-config-override -o yaml
apiVersion: v1
data:
  config: |
    [global]
    osd crush update on start = false
    osd pool default size = 2
    mon_allow_pool_delete = true
    osd_pool_default_pg_num = 64
kind: ConfigMap
......
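Instead of re-applying a YAML file, the same ConfigMap can also be edited in place; either way the change only takes effect once the daemons are restarted (see the next section). A minimal sketch:
kubectl -n rook-ceph edit configmap rook-config-override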
Tips for Restarting Components
After rook-config-override is modified, the ceph.conf mounted inside each container is automatically updated with the ConfigMap's contents, as shown below:
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# cat /etc/ceph/ceph.conf
[global]
osd crush update on start = false
osd pool default size = 2
mon_allow_pool_delete = true
osd_pool_default_pg_num = 64
However, simply reading the file does not make the settings take effect; the corresponding components must be restarted, including MON, MGR, RGW, MDS, OSD and so on. Restart only one pod at a time: after each pod comes back up, check the state with ceph -s and wait for the cluster to be healthy before restarting the next process. For example, restart a mon process:
[root@m1 rbd]# kubectl -n rook-ceph delete pods rook-ceph-mon-a-6cfc46ccd8-xrmzn
Watch the Ceph cluster state and wait for it to return to normal before restarting the next monitor process:
[root@m1 rbd]# ceph -s
cluster:
id: 17a413b5-f140-441a-8b35-feec8ae29521
health: HEALTH_WARN
2 daemons have recently crashed
services:
mon: 3 daemons, quorum b,d,e (age 3s)
mgr: a(active, since 61m)
mds: myfs:2 {0=myfs-d=up:active,1=myfs-b=up:active} 2 up:standby-replay
osd: 5 osds: 5 up (since 4m), 5 in (since 26h)
rgw: 2 daemons active (my.store.a, my.store.b)
task status:
data:
pools: 15 pools, 289 pgs
objects: 910 objects, 1.5 GiB
usage: 10 GiB used, 240 GiB / 250 GiB avail
pgs: 289 active+clean
io:
client: 2.1 KiB/s rd, 4 op/s rd, 0 op/s wr
In addition, restarting OSDs needs special care because it can trigger data migration: make sure the cluster returns to active+clean before restarting the next OSD, so that large-scale data movement does not disrupt normal workloads. The official documentation gives the following restart recommendations:
- mons: ensure all three mons are online and healthy before restarting each mon pod, one at a time.
- mgrs: the pods are stateless and can be restarted as needed, but note that this will disrupt the Ceph dashboard during restart.
- OSDs: restart the pods by deleting them, one at a time, and running ceph -s between each restart to ensure the cluster goes back to "active/clean" state.
- RGW: the pods are stateless and can be restarted as needed.
- MDS: the pods are stateless and can be restarted as needed.
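The procedure can be scripted. A minimal sketch (a hypothetical loop, not from the Rook docs), assuming the ceph CLI works on the node as in the examples above; it deletes mon pods one at a time and waits for HEALTH_OK before continuing. Adjust the health check if the cluster already carries warnings, and adapt the label selector for other components:
for pod in $(kubectl -n rook-ceph get pods -l app=rook-ceph-mon -o jsonpath='{.items[*].metadata.name}'); do
    # delete one pod, then wait until the cluster reports healthy again
    kubectl -n rook-ceph delete pod ${pod}
    until ceph health | grep -q HEALTH_OK; do sleep 10; done
done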
After restarting the mon, mgr, rgw, mds and osd components, check the configuration again; the new values are now in effect:
[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-mon-b-7486b4b679-hbsng -- bash
[root@rook-ceph-mon-b-7486b4b679-hbsng /]# ceph --admin-daemon /var/run/ceph/ceph-mon.b.asok config show | grep osd_pool_default_pg_num
"osd_pool_default_pg_num": "64",
[root@rook-ceph-mon-b-7486b4b679-hbsng /]# ceph --admin-daemon /var/run/ceph/ceph-mon.b.asok config show | grep "osd_pool_default_size"
"osd_pool_default_size": "2",
Ceph Tuning in Practice
The following rook-config-override collects a set of commonly tuned parameters (the comments explain each one):
apiVersion: v1
kind: ConfigMap
metadata:
  name: rook-config-override
  namespace: rook-ceph
data:
  config: |
    [global]
    osd crush update on start = false
    osd pool default size = 2
    mon_allow_pool_delete = true
    osd_pool_default_pg_num = 32
    mon_max_pg_per_osd = 250           # maximum PGs per OSD; a warning is raised above this
    mon_osd_full_ratio = 0.95          # writes are blocked once an OSD reaches 95% utilization
    mon_osd_nearfull_ratio = 0.85      # warn when an OSD is close to full (85%)
    [osd]
    osd_recovery_op_priority = 1       # priority of recovery ops during OSD recovery (default 3)
    osd_recovery_max_active = 1        # number of PGs an OSD recovers concurrently (default 0)
    osd_max_backfills = 1              # number of concurrent backfill operations
    osd_recovery_max_chunk = 1048576   # size of a recovered data chunk (default 8388608)
    osd_scrub_begin_hour = 1           # hour at which scrub consistency checks may start (default 0)
    osd_scrub_end_hour = 6             # hour at which scrub consistency checks must end (default 24)
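After restarting the OSD pods, the [osd] overrides can be confirmed on a running daemon. A hedged example; osd.1 matches the pod used earlier, adjust the id for your cluster:
kubectl -n rook-ceph exec deploy/rook-ceph-osd-1 -- ceph daemon osd.1 config show | grep -E 'osd_max_backfills|osd_recovery_op_priority'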
Adjusting the CRUSH Structure
The crushmap is how Ceph decides where data is placed. The default crushmap is usually sufficient, but some scenarios call for adjustments, such as:
- Data distribution: in a mixed SSD+HDD environment, the SSD pool and the HDD pool need to be separated and served to different workloads.
- Weight allocation: each OSD is given a weight based on its capacity by default, but data is never perfectly even; the weights can be adjusted when utilization is unbalanced.
- OSD primary affinity: adjust the mechanism that decides which OSD acts as the primary for writes (see the example below).
Data distribution for mixed scenarios is relatively complex to adjust; see the referenced blog post for details.
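For the primary-affinity knob mentioned above, a hedged example: lowering an OSD's primary affinity (range 0 to 1, default 1) makes it less likely to be chosen as the primary OSD for its PGs.
ceph osd primary-affinity osd.3 0.5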
For example, when an OSD's utilization gets too high and reaches 85%, the cluster reports nearfull. At that point OSDs should be added to the cluster, but if the other OSDs are lightly used you can instead adjust OSD weights as needed to trigger a redistribution of the data, as follows:
[root@m1 rbd]# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE RAW USE DATA OMAP META AVAIL %USE VAR PGS STATUS
0 hdd 0.04880 1.00000 50 GiB 1.9 GiB 937 MiB 600 KiB 1023 MiB 48 GiB 3.83 0.96 172 up
1 hdd 0.04880 1.00000 50 GiB 1.9 GiB 967 MiB 732 KiB 1023 MiB 48 GiB 3.89 0.98 178 up
2 hdd 0.04880 1.00000 50 GiB 2.2 GiB 1.2 GiB 369 KiB 1024 MiB 48 GiB 4.48 1.12 160 up
3 hdd 0.04880 1.00000 50 GiB 1.9 GiB 909 MiB 2.2 MiB 1022 MiB 48 GiB 3.78 0.95 185 up
4 hdd 0.04880 1.00000 50 GiB 2.0 GiB 1001 MiB 2.1 MiB 1022 MiB 48 GiB 3.96 0.99 172 up
TOTAL 250 GiB 10 GiB 5.0 GiB 5.9 MiB 5.0 GiB 240 GiB 3.99
MIN/MAX VAR: 0.95/1.12 STDDEV: 0.25
[root@m1 rbd]# ceph osd crush reweight osd.3 0.8
reweighted item id 3 name 'osd.3' to 0.8 in crush map
[root@m1 rbd]# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE RAW USE DATA OMAP META AVAIL %USE VAR PGS STATUS
0 hdd 0.04880 1.00000 50 GiB 1.9 GiB 937 MiB 600 KiB 1023 MiB 48 GiB 3.83 0.96 173 up
1 hdd 0.04880 1.00000 50 GiB 1.9 GiB 967 MiB 732 KiB 1023 MiB 48 GiB 3.89 0.98 167 up
2 hdd 0.04880 1.00000 50 GiB 2.2 GiB 1.2 GiB 369 KiB 1024 MiB 48 GiB 4.48 1.12 170 up
3 hdd 0.79999 1.00000 50 GiB 1.9 GiB 909 MiB 2.2 MiB 1022 MiB 48 GiB 3.78 0.95 206 up
4 hdd 0.04880 1.00000 50 GiB 2.0 GiB 1001 MiB 2.1 MiB 1022 MiB 48 GiB 3.96 0.99 151 up
TOTAL 250 GiB 10 GiB 5.0 GiB 5.9 MiB 5.0 GiB 240 GiB 3.99
After the adjustment, data is rebalanced automatically:
[root@m1 rbd]# ceph -s
cluster:
id: 17a413b5-f140-441a-8b35-feec8ae29521
health: HEALTH_WARN
Degraded data redundancy: 815/2730 objects degraded (29.853%), 72 pgs degraded
2 daemons have recently crashed
services:
mon: 3 daemons, quorum b,d,e (age 8m)
mgr: a(active, since 69m)
mds: myfs:2 {0=myfs-d=up:active,1=myfs-b=up:active} 2 up:standby-replay
osd: 5 osds: 5 up (since 12m), 5 in (since 26h); 57 remapped pgs
rgw: 2 daemons active (my.store.a, my.store.b)
task status:
data:
pools: 15 pools, 289 pgs
objects: 910 objects, 1.5 GiB
usage: 10 GiB used, 240 GiB / 250 GiB avail
pgs: 815/2730 objects degraded (29.853%)
465/2730 objects misplaced (17.033%)
190 active+clean
41 active+recovery_wait+degraded
31 active+recovery_wait+undersized+degraded+remapped
24 active+remapped+backfill_wait
2 active+recovery_wait+remapped
1 active+recovering
io:
recovery: 0 B/s, 2 keys/s, 0 objects/s
Customizing the OSD Network
Ceph provides two separate networks for different purposes:
- public network: the client-facing network, used to connect to the Ceph cluster and establish data channels
- cluster network: the internal network, used for Ceph's heartbeats and data replication
apiVersion: v1
data:
  config: |
    [global]
    # set in the rook-config-override configuration
    public network = 10.0.7.0/24
    cluster network = 10.0.10.0/24
    public addr = ""
    cluster addr = ""
By default these two networks are combined on one interface. If two separate NICs are available they can be split, which first requires host networking: hostNetwork means the container shares the host's network namespace. This can only be changed when Rook initially creates the cluster, and the setting lives in the cluster.yaml file:
[root@m1 rbd]# cd ../../
[root@m1 ceph]# vim cluster.yaml
71 network:
72 # enable host networking
73 #provider: host
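Enabling it amounts to uncommenting that provider line before the cluster is first created, for example (a hedged sketch; the sed pattern assumes the file looks exactly as above):
sed -i 's/#provider: host/provider: host/' cluster.yaml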
Adjusting the Failure Domain
Ceph supports setting a failure domain per pool. A failure domain is the scope of failure the pool can tolerate; Ceph supports several types, and the common ones are:
- datacenter: data-center level, e.g. three replicas placed in three different data centers
- rack: rack level, e.g. three replicas placed in three different racks
- host: host level, e.g. three replicas placed on three different hosts (the default rule)
- osd: disk level, e.g. three replicas placed on three different disks
The failure domain a pool uses can be defined when the pool is created. The following creates a pool whose failure domain is osd:
# the modified configuration
[root@m1 ceph]# grep -v "[.*#]" pool.yaml
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
  name: test-domain
  namespace: rook-ceph
spec:
  failureDomain: osd   # failure domain
  replicated:
    size: 3
    requireSafeReplicaSize: true
After creation, it can be verified with the following commands:
[root@m1 ceph]# kubectl apply -f pool.yaml
cephblockpool.ceph.rook.io/test-domain created
# view the underlying pool list
[root@m1 ceph]# ceph osd lspools
1 device_health_metrics
2 replicapool
3 myfs-metadata
4 myfs-data0
5 my-store.rgw.control
6 my-store.rgw.meta
7 my-store.rgw.log
8 my-store.rgw.buckets.index
9 my-store.rgw.buckets.non-ec
10 .rgw.root
11 my-store.rgw.buckets.data
12 evescn_test
13 testpool1
14 testpool2
16 pool2
17 test-domain
# view the crush_rule of the pool
[root@m1 ceph]# ceph osd pool get test-domain crush_rule
crush_rule: test-domain
# view the failure domain type
[root@m1 ceph]# ceph osd crush rule dump test-domain
{
"rule_id": 12,
"rule_name": "test-domain",
"ruleset": 12,
"type": 1,
"min_size": 1,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "choose_firstn",
"num": 0,
"type": "osd" # 故障域型別
},
{
"op": "emit"
}
]
}
With an osd failure domain, the three replicas are spread across three different disks regardless of whether those disks sit on the same host, so there is a risk of data loss. When the failure domain needs to be changed for whatever reason, it can be adjusted as follows. First create a CRUSH rule associated with the desired failure domain:
[root@m1 ceph]# ceph osd crush rule create-replicated happylau-rule default host
[root@m1 ceph]# ceph osd crush rule dump happylau-rule
{
"rule_id": 13,
"rule_name": "happylau-rule",
"ruleset": 13,
"type": 1,
"min_size": 1,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host" # 建立 happylau-rule 的規則,故障域為 host
},
{
"op": "emit"
}
]
}
After creating the rule, apply it to the corresponding pool:
[root@m1 ceph]# ceph osd pool set test-domain crush_rule happylau-rule
set pool 17 crush_rule to happylau-rule
[root@m1 ceph]# ceph osd pool get test-domain crush_rule
crush_rule: happylau-rule
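To double-check the effect, the PGs of the pool can be listed and their acting OSDs compared against the host of each OSD in the CRUSH tree; with a host failure domain, no PG should keep two replicas on OSDs of the same host. A hedged verification sketch:
ceph pg ls-by-pool test-domain | head
ceph osd tree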