
15 Ceph Advanced Parameter Configuration (Reposted)

Contents

Ceph Advanced Parameter Configuration

Adjusting Namespaces

See the official documentation

cd cluster/examples/kubernetes/ceph

export ROOK_OPERATOR_NAMESPACE="rook-ceph"
export ROOK_CLUSTER_NAMESPACE="rook-ceph"

sed -i.bak \
    -e "s/\(.*\):.*# namespace:operator/\1: $ROOK_OPERATOR_NAMESPACE # namespace:operator/g" \
    -e "s/\(.*\):.*# namespace:cluster/\1: $ROOK_CLUSTER_NAMESPACE # namespace:cluster/g" \
    -e "s/\(.*serviceaccount\):.*:\(.*\) # serviceaccount:namespace:operator/\1:$ROOK_OPERATOR_NAMESPACE:\2 # serviceaccount:namespace:operator/g" \
    -e "s/\(.*serviceaccount\):.*:\(.*\) # serviceaccount:namespace:cluster/\1:$ROOK_CLUSTER_NAMESPACE:\2 # serviceaccount:namespace:cluster/g" \
    -e "s/\(.*\): [-_A-Za-z0-9]*\.\(.*\) # driver:namespace:operator/\1: $ROOK_OPERATOR_NAMESPACE.\2 # driver:namespace:operator/g" \
    -e "s/\(.*\): [-_A-Za-z0-9]*\.\(.*\) # driver:namespace:cluster/\1: $ROOK_CLUSTER_NAMESPACE.\2 # driver:namespace:cluster/g" \
  common.yaml operator.yaml cluster.yaml # add other files or change these as desired for your config

# You need to use `apply` for all Ceph clusters after the first if you have only one Operator
kubectl apply -f common.yaml -f operator.yaml -f cluster.yaml # add other files as desired for your config
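
A quick way to confirm that the resources ended up in the intended namespaces (a simple check, not part of the original walkthrough) is to list them in the cluster namespace:

kubectl -n $ROOK_CLUSTER_NAMESPACE get pods,serviceaccounts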

Collecting Container Logs

See the official documentation

[root@m1 rbd]# cat /tmp/log.sh
#!/bin/bash

log_path="/var/log/rook"
DATE=`date +%F`

mkdir -pv ${log_path}

for p in $(kubectl -n rook-ceph get pods -o jsonpath='{.items[*].metadata.name}')
do
    for c in $(kubectl -n rook-ceph get pod ${p} -o jsonpath='{.spec.containers[*].name}')
    do
        echo "BEGIN logs from pod: ${p} ${c}"
        kubectl -n rook-ceph logs -c ${c} ${p} | tee ${log_path}/${c}_${p}_${DATE}.logs
        echo "END logs from pod: ${p} ${c}"
    done
done
[root@m1 rbd]# ls /var/log/rook/ -lh
total 11M
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.133-778bbd9bc5-slv77_2022-12-02.logs
-rw-r--r-- 1 root root  329 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.134-55bffbcd86-8db2m_2022-12-02.logs
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.135-568bff4f85-ftdvx_2022-12-02.logs
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.136-55fdc6f5bd-kqtgh_2022-12-02.logs
-rw-r--r-- 1 root root   64 Dec  2 16:35 ceph-crash_rook-ceph-crashcollector-192.168.100.137-9c7cb5f7-svz9z_2022-12-02.logs
-rw-r--r-- 1 root root  517 Dec  2 16:35 csi-attacher_csi-cephfsplugin-provisioner-8658f67749-jxshb_2022-12-02.logs
-rw-r--r-- 1 root root  517 Dec  2 16:34 csi-attacher_csi-cephfsplugin-provisioner-8658f67749-jxshb_.logs
-rw-r--r-- 1 root root  734 Dec  2 16:35 csi-attacher_csi-cephfsplugin-provisioner-8658f67749-whmrx_2022-12-02.logs
-rw-r--r-- 1 root root 1.0K Dec  2 16:35 csi-attacher_csi-rbdplugin-provisioner-94f699d86-bh4fv_2022-12-02.logs
-rw-r--r-- 1 root root  728 Dec  2 16:35 csi-attacher_csi-rbdplugin-provisioner-94f699d86-p6vpm_2022-12-02.logs
......
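
To collect these logs on a schedule rather than by hand, the script could be run from cron; a minimal illustration (the schedule and path are assumptions, not from the original):

# crontab entry: run the collection script every day at 02:00
0 2 * * * /bin/bash /tmp/log.sh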

Collecting OSD Information

Ceph has a large number of configuration options that can be retrieved through the admin socket. For example, ceph --admin-daemon /var/run/ceph/mon-node-1.socket config show returns the configuration of the mon on m1, and options can be temporarily set and read with config set and config get. Since rook runs the daemons in containers, we have to enter the relevant container to get at this information, as follows:

[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-mon-a-6cfc46ccd8-xrmzn -- bash
[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config show | head
{
    "name": "mon.a",
    "cluster": "ceph",
    "admin_socket": "/var/run/ceph/ceph-mon.a.asok",
    "admin_socket_mode": "",
    "allow_ansi": "Terminal",
    "auth_client_required": "cephx, none",
    "auth_cluster_required": "cephx",
    "auth_debug": "false",
    "auth_mon_ticket_ttl": "43200.000000",

Likewise, the OSDs run as containers in the cluster, and you can enter a container to inspect the related information:

[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-osd-1-5866f9f558-jq994 -- bash
[root@rook-ceph-osd-1-5866f9f558-jq994 /]# ls -l /var/lib/ceph/osd/ceph-1/
total 28
lrwxrwxrwx 1 ceph ceph 93 Dec  1 05:29 block -> /dev/ceph-4a84daae-a3c8-40a3-822b-100c8e47d8d2/osd-block-da31a738-a678-4aa1-b238-5ce69ba5aaa0
-rw------- 1 ceph ceph 37 Dec  1 05:29 ceph_fsid
-rw------- 1 ceph ceph 37 Dec  1 05:29 fsid
-rw------- 1 ceph ceph 55 Dec  1 05:29 keyring
-rw------- 1 ceph ceph  6 Dec  1 05:29 ready
-rw------- 1 ceph ceph  3 Dec  1 05:29 require_osd_release
-rw------- 1 ceph ceph 10 Dec  1 05:29 type
-rw------- 1 ceph ceph  2 Dec  1 05:29 whoami
[root@rook-ceph-osd-1-5866f9f558-jq994 /]# lsblk -f
NAME                                                                              FSTYPE      LABEL           UUID                                   MOUNTPOINT
sda                                                                                                                                                  
|-sda1                                                                            xfs                         fef3e1da-62e1-41eb-b1cb-19021cce0cf5   
`-sda2                                                                            LVM2_member                 wzFi9m-QD1s-G4Yi-SBN4-7ud9-8wYk-NGx1dS 
  |-centos-root                                                                   xfs                         9886a31d-694b-4957-ad05-247edc04dd88   /var/lib/ceph/osd/ceph-1
  `-centos-swap                                                                   swap                        239e607c-1a07-44d5-ae89-1b415e160ff8   
sdb                                                                               LVM2_member                 3EQj3n-Gkb8-aDVN-iTJj-JrpZ-mhjU-Yyph3G 
`-ceph--4a84daae--a3c8--40a3--822b--100c8e47d8d2-osd--block--da31a738--a678--4aa1--b238--5ce69ba5aaa0
                                                                                                                                                     
sr0                                                                               iso9660     CentOS 7 x86_64 2020-11-03-14-55-29-00    

Official script for viewing OSD information

See the official documentation

# Get OSD Pods
# This uses the example/default cluster name "rook"
OSD_PODS=$(kubectl get pods --all-namespaces -l \
  app=rook-ceph-osd,rook_cluster=rook-ceph -o jsonpath='{.items[*].metadata.name}')

# Find node and drive associations from OSD pods
for pod in $(echo ${OSD_PODS})
do
 echo "Pod:  ${pod}"
 echo "Node: $(kubectl -n rook-ceph get pod ${pod} -o jsonpath='{.spec.nodeName}')"
 kubectl -n rook-ceph exec ${pod} -- sh -c '\
  for i in /var/lib/ceph/osd/ceph-*; do
    [ -f ${i}/ready ] || continue
    echo -ne "-$(basename ${i}) "
    echo $(lsblk -n -o NAME,SIZE ${i}/block 2> /dev/null || \
    findmnt -n -v -o SOURCE,SIZE -T ${i}) $(cat ${i}/type)
  done | sort -V
  echo'
done

Configuring Pool Parameters

Ceph exposes pool tuning parameters through ceph osd pool set, such as size, pg_num and pgp_num. For example, to adjust the replica count:

[root@m1 rbd]# ceph osd pool get testpool1 size
size: 3
[root@m1 rbd]# ceph osd pool set testpool1 size 2
set pool 13 size to 2
[root@m1 rbd]# ceph osd pool get testpool1 size
size: 2

Besides the replica count, another very important pool parameter is the number of PGs. PG is short for placement group; together with PGP it describes how data is distributed, and the CRUSH algorithm maps the PGs onto the corresponding OSDs. If too few PGs end up on an OSD, data loss may result, so a reasonably sized value has to be chosen. PG autoscaling is enabled by default, as shown below:

[root@m1 rbd]# ceph osd pool autoscale-status
POOL                           SIZE  TARGET SIZE  RATE  RAW CAPACITY   RATIO  TARGET RATIO  EFFECTIVE RATIO  BIAS  PG_NUM  NEW PG_NUM  AUTOSCALE  
device_health_metrics            0                 3.0        249.9G  0.0000                                  1.0       1              on         
replicapool                   1454M                3.0        249.9G  0.0171                                  1.0      32              on         
myfs-metadata                 3252k                3.0        249.9G  0.0000                                  4.0      32              on         
myfs-data0                   108.4M                3.0        249.9G  0.0013                                  1.0      32              on         
my-store.rgw.control             0                 3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.meta             7525                 3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.log             222.5k                3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.buckets.index   830.3k                3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.buckets.non-ec   1352                 3.0        249.9G  0.0000                                  1.0       8              on         
.rgw.root                     3927                 3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.buckets.data     3314                 3.0        249.9G  0.0000                                  1.0      32              on         
evescn_test                  900.9k                3.0        249.9G  0.0000                                  1.0      32              on         
testpool1                        0                 2.0        249.9G  0.0000                                  1.0      16              on         
testpool2                        0                 2.0        249.9G  0.0000                                  1.0      16              on  

Ceph adjusts the number of PGs automatically based on the amount of data. In production, however, PG counts are generally set by hand, so autoscaling needs to be turned off:

[root@m1 rbd]# ceph osd pool get testpool1 pg_autoscale_mode
pg_autoscale_mode: on

[root@m1 rbd]# ceph osd pool set testpool1 pg_autoscale_mode off
set pool 13 pg_autoscale_mode to off

[root@m1 rbd]# ceph osd pool get testpool1 pg_autoscale_mode
pg_autoscale_mode: off

Once autoscaling is disabled, the number of PGs and PGPs has to be set manually (a rough rule of thumb for choosing pg_num is sketched after the listing below):

[root@m1 rbd]# ceph osd pool set testpool1 pg_num 32
set pool 13 pg_num to 32

[root@m1 rbd]# ceph osd pool set testpool1 pgp_num 32
set pool 13 pgp_num to 32

[root@m1 rbd]# ceph osd pool autoscale-status
POOL                           SIZE  TARGET SIZE  RATE  RAW CAPACITY   RATIO  TARGET RATIO  EFFECTIVE RATIO  BIAS  PG_NUM  NEW PG_NUM  AUTOSCALE  
device_health_metrics            0                 3.0        249.9G  0.0000                                  1.0       1              on         
replicapool                   1454M                3.0        249.9G  0.0171                                  1.0      32              on         
myfs-metadata                 3252k                3.0        249.9G  0.0000                                  4.0      32              on         
myfs-data0                   108.4M                3.0        249.9G  0.0013                                  1.0      32              on         
my-store.rgw.control             0                 3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.meta             7525                 3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.log             222.5k                3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.buckets.index   830.3k                3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.buckets.non-ec   1352                 3.0        249.9G  0.0000                                  1.0       8              on         
.rgw.root                     3927                 3.0        249.9G  0.0000                                  1.0       8              on         
my-store.rgw.buckets.data     3314                 3.0        249.9G  0.0000                                  1.0      32              on         
evescn_test                  900.9k                3.0        249.9G  0.0000                                  1.0      32              on         
testpool1                        0                 2.0        249.9G  0.0000                                  1.0      32              off        
testpool2                        0                 2.0        249.9G  0.0000                                  1.0      32              on   
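
When choosing pg_num by hand, a commonly cited rule of thumb (not from this article, so treat it only as a rough starting point) is to aim for roughly 100 PGs per OSD in total, divided by the replica size and rounded to a power of two:

# rough calculation for this 5-OSD cluster with 3 replicas
OSDS=5
SIZE=3
echo $(( OSDS * 100 / SIZE ))   # ~166 PGs in total; split that across the data-bearing pools and round each to a power of two (e.g. 32 or 64)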

Adjusting mon Parameters

There are generally two ways to adjust Ceph parameters:

  • Temporary: adjust online with config set; the change takes effect immediately but is only temporary
  • Permanent: edit the ceph.conf configuration file; the individual component processes must be restarted afterwards

Let's start with an example. When a pool is created it is assigned pg_num and pgp_num values, which default to 32:

[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-mon-a-6cfc46ccd8-xrmzn -- bash

[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config show | grep pg_num
    "mgr_debug_aggressive_pg_num_changes": "false",
    "mon_max_pool_pg_num": "65536",
    "mon_warn_on_pool_pg_num_not_power_of_two": "true",
    "osd_pool_default_pg_num": "32",    # 每個 pool 預設的 pg_num 為 32
    "rgw_rados_pool_pg_num_min": "8",

A pool created now therefore gets the default pg_num and pgp_num of 32, which can be verified as follows:

[root@m1 rbd]# ceph osd pool create pool1
pool 'pool1' created
[root@m1 rbd]# ceph osd pool get pool1 pg_num
pg_num: 32
[root@m1 rbd]# ceph osd pool get pool1 pgp_num
pgp_num: 32

Adjust the parameter, changing its value from 32 to 16:

[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config set osd_pool_default_pg_num 16
{
    "success": "osd_pool_default_pg_num = '16' (not observed, change may require restart) "
}

Change the parameter on the other two mons in the same way. Once done, test as follows:

[root@m1 rbd]# ceph osd pool create pool2
pool 'pool2' created
[root@m1 rbd]# ceph osd pool get pool2 pg_num
pg_num: 16
[root@m1 rbd]# ceph osd pool get pool2 pgp_num
pgp_num: 16

Preventing Accidental Pool Deletion

Pools can be deleted by default. This is controlled by the mon_allow_pool_delete switch, which defaults to true, so deletion is allowed:

[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config show | grep delete
    "mon_allow_pool_delete": "true",

A pool can therefore be removed:

[root@m1 rbd]# ceph osd pool rm pool1 pool1 --yes-i-really-really-mean-it
pool 'pool1' removed

To prevent accidental deletion, turn the switch off; this must be done on all three mon services:

[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config set mon_allow_pool_delete false
{
    "success": "mon_allow_pool_delete = 'false' "
}

Once it is off, another deletion attempt fails with a message that pool deletion is disabled:

[root@m1 rbd]# ceph osd pool rm pool2 pool2 --yes-i-really-really-mean-it
Error EPERM: pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool
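
If a pool really does have to be removed later, the same switch can be turned back on temporarily (again on all three mons, using the command shown above) and disabled once the deletion is done:

[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok config set mon_allow_pool_delete true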

Customizing the Ceph Configuration

Settings made with config set are only temporary. To make a configuration change permanent, ceph.conf has to be modified; in rook this is managed by editing the rook-config-override ConfigMap, for example:

[root@m1 rbd]# cat overide.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: rook-config-override
  namespace: rook-ceph
data:
  config: |
    [global]
    osd crush update on start = false
    osd pool default size = 2
    mon_allow_pool_delete = true
    osd_pool_default_pg_num = 64

Apply it to the cluster:

[root@m1 rbd]# kubectl apply -f overide.yaml
Warning: resource configmaps/rook-config-override is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
configmap/rook-config-override configured

Check the resulting ConfigMap:

[root@m1 rbd]# kubectl get configmaps -n rook-ceph rook-config-override -o yaml
apiVersion: v1
data:
  config: |
    [global]
    osd crush update on start = false
    osd pool default size = 2
    mon_allow_pool_delete = true
    osd_pool_default_pg_num = 64
kind: ConfigMap
......

Tips for Restarting Components

After rook-config-override is modified, the ceph.conf inside the containers is automatically refreshed with the ConfigMap's contents, as shown below:

[root@rook-ceph-mon-a-6cfc46ccd8-xrmzn /]# cat /etc/ceph/ceph.conf 
[global]
osd crush update on start = false
osd pool default size = 2
mon_allow_pool_delete = true
osd_pool_default_pg_num = 64

However, reading the file alone does not make the configuration take effect. The corresponding components, including MON, MGR, RGW, MDS and OSD, have to be restarted. When restarting, only restart one pod at a time: make sure the pod has come back up, check the state with ceph -s, and only restart the next process once Ceph is healthy again. For example, restarting a mon process:

[root@m1 rbd]# kubectl -n rook-ceph delete pods rook-ceph-mon-a-6cfc46ccd8-xrmzn 

Watch the state of the Ceph cluster and wait for it to return to normal before restarting the next monitor process:

[root@m1 rbd]# ceph -s
  cluster:
    id:     17a413b5-f140-441a-8b35-feec8ae29521
    health: HEALTH_WARN
            2 daemons have recently crashed
 
  services:
    mon: 3 daemons, quorum b,d,e (age 3s)
    mgr: a(active, since 61m)
    mds: myfs:2 {0=myfs-d=up:active,1=myfs-b=up:active} 2 up:standby-replay
    osd: 5 osds: 5 up (since 4m), 5 in (since 26h)
    rgw: 2 daemons active (my.store.a, my.store.b)
 
  task status:
 
  data:
    pools:   15 pools, 289 pgs
    objects: 910 objects, 1.5 GiB
    usage:   10 GiB used, 240 GiB / 250 GiB avail
    pgs:     289 active+clean
 
  io:
    client:   2.1 KiB/s rd, 4 op/s rd, 0 op/s wr

In addition, restarting OSDs requires particular care because data migration is involved: make sure the ceph status is active+clean before moving on to the next restart, so that large-scale data movement triggered by the restarts does not affect normal workloads. Below are the component-restart recommendations from the official documentation, followed by a small sketch of the procedure:

  • mons: ensure all three mons are online and healthy before restarting each mon pod, one at a time.
  • mgrs: the pods are stateless and can be restarted as needed, but note that this will disrupt the Ceph dashboard during restart.
  • OSDs: restart the OSD pods by deleting them, one at a time, and running ceph -s between each restart to ensure the cluster goes back to “active/clean” state.
  • RGW: the pods are stateless and can be restarted as needed.
  • MDS: the pods are stateless and can be restarted as needed.
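
The following is a small sketch of that one-pod-at-a-time procedure for the mons. It assumes the ceph CLI works on this host, as elsewhere in this article, and that the mon pods carry the app=rook-ceph-mon label:

#!/bin/bash
# Delete each rook-ceph mon pod in turn and wait for the cluster to settle before touching the next one.
for p in $(kubectl -n rook-ceph get pods -l app=rook-ceph-mon -o jsonpath='{.items[*].metadata.name}')
do
    echo "restarting ${p}"
    kubectl -n rook-ceph delete pod ${p}
    sleep 30                                 # give the replacement pod time to start
    until ceph health | grep -q HEALTH_OK    # relax this check if your baseline state is HEALTH_WARN, as in the cluster above
    do
        echo "waiting for the cluster to become healthy ..."
        sleep 10
    done
done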

After the mon, mgr, rgw, mds and osd components have all been restarted, check the configuration and you can see that it has taken effect:

[root@m1 rbd]# kubectl -n rook-ceph exec -it rook-ceph-mon-b-7486b4b679-hbsng -- bash

[root@rook-ceph-mon-b-7486b4b679-hbsng /]# ceph --admin-daemon /var/run/ceph/ceph-mon.b.asok config show | grep  osd_pool_default_pg_num
    "osd_pool_default_pg_num": "64",

[root@rook-ceph-mon-b-7486b4b679-hbsng /]# ceph --admin-daemon /var/run/ceph/ceph-mon.b.asok config show | grep "osd_pool_default_size"
    "osd_pool_default_size": "2",

Ceph Tuning in Practice

apiVersion: v1
kind: ConfigMap
metadata:
  name: rook-config-override
  namespace: rook-ceph
data:
  config: |
    [global]
    osd crush update on start = false
    osd pool default size = 2
    mon_allow_pool_delete = true
    osd_pool_default_pg_num = 32
    mon_max_pg_per_osd = 250            # maximum number of PGs per OSD; a warning is raised above this
    mon_osd_full_ratio = 0.95           # writes are blocked once an OSD reaches 95% utilization
    mon_osd_nearfull_ratio = 0.85       # warn when an OSD is nearly full
    
    [osd]
    osd_recovery_op_priority = 1        # priority of recovery operations, default 3
    osd_recovery_max_active = 1         # number of PGs recovering simultaneously per OSD, default 0
    osd_max_backfills = 1               # number of concurrent backfill operations
    osd_recovery_max_chunk = 1048576    # chunk size used during recovery, default 8388608
    osd_scrub_begin_hour = 1            # hour at which scrub consistency checks may start, default 0
    osd_scrub_end_hour = 6              # hour at which scrub consistency checks must end, default 24
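
After the OSD pods have been restarted, the [osd] values can be verified in the same way as the mon options earlier, from inside an OSD container (the pod name will have changed after the restart; the admin-socket path is assumed to follow the usual ceph-osd.<id>.asok pattern):

ceph --admin-daemon /var/run/ceph/ceph-osd.1.asok config show | grep -E "osd_max_backfills|osd_recovery_max_active"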

Adjusting the CRUSH Structure

The crushmap is how Ceph decides where data is placed. The default crushmap is usually fine, but some scenarios call for adjustments, such as:

  • Data placement: in a mixed SSD + HDD environment, the SSD pool and the HDD pool need to be separated so that two different kinds of workloads can share the cluster
  • Weight assignment: by default an OSD's weight is derived from its capacity, but data is never perfectly even, so the weight can be adjusted when utilization is unbalanced
  • OSD affinity: adjust the primary-affinity mechanism that decides which OSD takes the primary role for writes (see the short example after this list)
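
As a brief illustration of the last point (a sketch, not taken from the original): lowering an OSD's primary affinity makes it less likely to be selected as the primary for its PGs, for example:

[root@m1 rbd]# ceph osd primary-affinity osd.3 0.5    # value between 0 and 1, default 1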

Adjusting data placement for mixed scenarios is relatively involved; refer to this blog post for details.

If a single OSD's utilization gets too high, a nearfull warning is raised at 85%. At that point more OSDs should be added to the cluster; alternatively, if the other OSDs are lightly used, the OSD's weight can be adjusted to trigger a redistribution of the data, as follows:

[root@m1 rbd]# ceph osd df
ID  CLASS  WEIGHT   REWEIGHT  SIZE     RAW USE  DATA      OMAP     META      AVAIL    %USE  VAR   PGS  STATUS
 0    hdd  0.04880   1.00000   50 GiB  1.9 GiB   937 MiB  600 KiB  1023 MiB   48 GiB  3.83  0.96  172      up
 1    hdd  0.04880   1.00000   50 GiB  1.9 GiB   967 MiB  732 KiB  1023 MiB   48 GiB  3.89  0.98  178      up
 2    hdd  0.04880   1.00000   50 GiB  2.2 GiB   1.2 GiB  369 KiB  1024 MiB   48 GiB  4.48  1.12  160      up
 3    hdd  0.04880   1.00000   50 GiB  1.9 GiB   909 MiB  2.2 MiB  1022 MiB   48 GiB  3.78  0.95  185      up
 4    hdd  0.04880   1.00000   50 GiB  2.0 GiB  1001 MiB  2.1 MiB  1022 MiB   48 GiB  3.96  0.99  172      up
                       TOTAL  250 GiB   10 GiB   5.0 GiB  5.9 MiB   5.0 GiB  240 GiB  3.99                   
MIN/MAX VAR: 0.95/1.12  STDDEV: 0.25

[root@m1 rbd]# ceph osd crush reweight osd.3 0.8
reweighted item id 3 name 'osd.3' to 0.8 in crush map

[root@m1 rbd]# ceph osd df
ID  CLASS  WEIGHT   REWEIGHT  SIZE     RAW USE  DATA      OMAP     META      AVAIL    %USE  VAR   PGS  STATUS
 0    hdd  0.04880   1.00000   50 GiB  1.9 GiB   937 MiB  600 KiB  1023 MiB   48 GiB  3.83  0.96  173      up
 1    hdd  0.04880   1.00000   50 GiB  1.9 GiB   967 MiB  732 KiB  1023 MiB   48 GiB  3.89  0.98  167      up
 2    hdd  0.04880   1.00000   50 GiB  2.2 GiB   1.2 GiB  369 KiB  1024 MiB   48 GiB  4.48  1.12  170      up
 3    hdd  0.79999   1.00000   50 GiB  1.9 GiB   909 MiB  2.2 MiB  1022 MiB   48 GiB  3.78  0.95  206      up
 4    hdd  0.04880   1.00000   50 GiB  2.0 GiB  1001 MiB  2.1 MiB  1022 MiB   48 GiB  3.96  0.99  151      up
                       TOTAL  250 GiB   10 GiB   5.0 GiB  5.9 MiB   5.0 GiB  240 GiB  3.99     

After the adjustment, the data is rebalanced automatically:

[root@m1 rbd]# ceph -s
  cluster:
    id:     17a413b5-f140-441a-8b35-feec8ae29521
    health: HEALTH_WARN
            Degraded data redundancy: 815/2730 objects degraded (29.853%), 72 pgs degraded
            2 daemons have recently crashed
 
  services:
    mon: 3 daemons, quorum b,d,e (age 8m)
    mgr: a(active, since 69m)
    mds: myfs:2 {0=myfs-d=up:active,1=myfs-b=up:active} 2 up:standby-replay
    osd: 5 osds: 5 up (since 12m), 5 in (since 26h); 57 remapped pgs
    rgw: 2 daemons active (my.store.a, my.store.b)
 
  task status:
 
  data:
    pools:   15 pools, 289 pgs
    objects: 910 objects, 1.5 GiB
    usage:   10 GiB used, 240 GiB / 250 GiB avail
    pgs:     815/2730 objects degraded (29.853%)
             465/2730 objects misplaced (17.033%)
             190 active+clean
             41  active+recovery_wait+degraded
             31  active+recovery_wait+undersized+degraded+remapped
             24  active+remapped+backfill_wait
             2   active+recovery_wait+remapped
             1   active+recovering
 
  io:
    recovery: 0 B/s, 2 keys/s, 0 objects/s

Customizing the OSD Network

Ceph uses two separate networks for different purposes:

  • public network: the client-facing network, used to connect to the Ceph cluster and establish data channels
  • cluster network: the internal data network, used for Ceph heartbeats and data replication

Both are set in the override configuration, for example:

apiVersion: v1
data:
  config: |
    [global]
    # set these in the rook-config-override configuration
    public network =  10.0.7.0/24
    cluster network = 10.0.10.0/24
    public addr = ""
    cluster addr = ""

By default the two networks are combined. If the nodes have two different NICs, the networks can be separated. First the network has to be set to hostNetwork; hostNetwork means the container network and the host network are the same. This can only be changed when rook initializes the cluster, and the setting lives in the cluster.yaml file:

[root@m1 rbd]# cd ../../
[root@m1 ceph]# vim cluster.yaml
 71   network:
 72     # enable host networking
 73     #provider: host

Adjusting the Failure Domain

Ceph supports setting a failure domain per pool. What is a failure domain? It is the scope of failure the pool can tolerate when something goes wrong. Ceph supports several types of failure domains; the common ones are:

  • datacenter: data-center level, e.g. three replicas placed in three different data centers
  • rack: rack level, e.g. three replicas placed in three different racks
  • host: host level, e.g. three replicas placed on three different hosts; this is the default rule
  • osd: disk level, e.g. three replicas placed on three different disks

The failure domain a pool uses can be defined when the pool is created. The following creates a pool whose failure domain is osd:

# edit the configuration
[root@m1 ceph]# grep -v "[.*#]" pool.yaml 

apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
  name: test-domain
  namespace: rook-ceph
spec:
  failureDomain: osd # failure domain
  replicated:
    size: 3
    requireSafeReplicaSize: true

After it has been created, it can be verified with the following commands:

[root@m1 ceph]# kubectl apply -f pool.yaml 
cephblockpool.ceph.rook.io/test-domain created

# view the underlying pools
[root@m1 ceph]# ceph osd lspools
1 device_health_metrics
2 replicapool
3 myfs-metadata
4 myfs-data0
5 my-store.rgw.control
6 my-store.rgw.meta
7 my-store.rgw.log
8 my-store.rgw.buckets.index
9 my-store.rgw.buckets.non-ec
10 .rgw.root
11 my-store.rgw.buckets.data
12 evescn_test
13 testpool1
14 testpool2
16 pool2
17 test-domain

# view the pool's crush_rule
[root@m1 ceph]# ceph osd pool get test-domain  crush_rule
crush_rule: test-domain

# view the failure-domain type
[root@m1 ceph]# ceph osd crush rule dump test-domain
{
    "rule_id": 12,
    "rule_name": "test-domain",
    "ruleset": 12,
    "type": 1,
    "min_size": 1,
    "max_size": 10,
    "steps": [
        {
            "op": "take",
            "item": -1,
            "item_name": "default"
        },
        {
            "op": "choose_firstn",
            "num": 0,
            "type": "osd"     # 故障域型別
        },
        {
            "op": "emit"
        }
    ]
}

With an osd failure domain, the data is spread across three different disks regardless of whether those disks sit on the same host, so there is a risk of data loss. When the failure domain has to be changed for whatever reason, it can be adjusted as follows; first, create a rule associated with the desired failure domain:

[root@m1 ceph]# ceph osd crush rule create-replicated happylau-rule default host

[root@m1 ceph]# ceph osd crush rule dump happylau-rule
{
    "rule_id": 13,
    "rule_name": "happylau-rule",
    "ruleset": 13,
    "type": 1,
    "min_size": 1,
    "max_size": 10,
    "steps": [
        {
            "op": "take",
            "item": -1,
            "item_name": "default"
        },
        {
            "op": "chooseleaf_firstn",
            "num": 0,
            "type": "host"    # 建立 happylau-rule 的規則,故障域為 host
        },
        {
            "op": "emit"
        }
    ]
}

After the rule has been created, apply it to the corresponding pool:

[root@m1 ceph]# ceph osd pool set test-domain crush_rule happylau-rule
set pool 17 crush_rule to happylau-rule
[root@m1 ceph]# ceph osd pool get test-domain crush_rule
crush_rule: happylau-rule
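
To sanity-check that the new rule is in effect, the pool's PG placement can be compared against the host layout (a quick check, not part of the original):

[root@m1 ceph]# ceph osd tree                          # shows which OSDs sit under which hosts
[root@m1 ceph]# ceph pg ls-by-pool test-domain | head  # the up/acting sets should now use OSDs from different hosts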