Using GPUs in a Kubernetes Cluster and Installing Kubeflow 1.0 RC: Step-by-Step Guide
Install the NVIDIA GPU driver
sudo yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
sudo yum clean all
sudo yum -y install nvidia-driver-latest-dkms cuda
sudo yum -y install cuda-drivers
If gcc dependencies are missing, run the following commands:
yum install kernel-devel kernel-doc kernel-headers gcc\* glibc\* glibc-\*
rpm --import https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
rpm -Uvh http://www.elrepo.org/elrepo-release-7.0-3.el7.elrepo.noarch.rpm
yum install -y kmod-nvidia
### Add rdblacklist=nouveau to GRUB_CMDLINE_LINUX, then blacklist the nouveau module:
echo -e "blacklist nouveau\noptions nouveau modeset=0" > /etc/modprobe.d/blacklist.conf
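On CentOS the nouveau module is usually also baked into the initramfs, so it typically has to be rebuilt before rebooting for the blacklist to take effect; a minimal sketch:
# Back up the current initramfs and rebuild it so the nouveau blacklist is honored at boot
mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak
dracut --force /boot/initramfs-$(uname -r).img $(uname -r)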
Reboot, then check whether nouveau was disabled successfully:
lsmod|grep nouv
If there is no output, nouveau has been disabled.
Check the server's GPU information:
# nvidia-smi
Tue Jan 14 03:46:41 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 440.44       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:18:00.0 Off |                    0 |
| N/A   29C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:86:00.0 Off |                    0 |
| N/A   25C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
Install nvidia-docker
curl -s -L https://nvidia.github.io/nvidia-docker/centos7/x86_64/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo
- Query available nvidia-docker versions
yum search --showduplicates nvidia-docker
- Install nvidia-docker
The Docker version here is 18.09.7-ce, so install the following nvidia-docker version:
yum install -y nvidia-docker2
pkill -SIGHUP dockerd
Run nvidia-docker version to check the installed nvidia-docker version.
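To confirm that containers can actually see the GPUs before wiring this into Kubernetes, you can run nvidia-smi inside a CUDA base image (the image tag is only an example; use one that matches your driver):
# Should list the same two Tesla T4s as on the host
docker run --rm --runtime=nvidia nvidia/cuda:10.2-base nvidia-smi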
Set the default Docker runtime to nvidia
# cat /etc/docker/daemon.json
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "registry-mirrors": ["https://o96k4rm0.mirror.aliyuncs.com"]
}
Restart Docker and the kubelet
systemctl daemon-reload
systemctl restart docker.service
systemctl restart kubelet
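After the restart it is worth checking that Docker really picked up nvidia as its default runtime, for example:
# "Default Runtime" should now report nvidia
docker info | grep -i runtime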
Deploy the Aliyun gpushare scheduler extender and device plugin (see the references below):
cd /etc/kubernetes/
curl -O https://raw.githubusercontent.com/AliyunContainerService/gpushare-scheduler-extender/master/config/scheduler-policy-config.json
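According to the gpushare-scheduler-extender guide referenced below, kube-scheduler also has to be pointed at this policy file; on a kubeadm cluster that roughly means editing the static pod manifest (exact paths and flags may differ on your cluster):
# Edit /etc/kubernetes/manifests/kube-scheduler.yaml and add to the scheduler command:
#   - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json
# plus a volume/volumeMount that makes the JSON file visible inside the pod
vi /etc/kubernetes/manifests/kube-scheduler.yaml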
cd /tmp/
curl -O https://raw.githubusercontent.com/AliyunContainerService/gpushare-scheduler-extender/master/config/gpushare-schd-extender.yaml
kubectl create -f gpushare-schd-extender.yaml
kubectl create -f device-plugin-rbac.yaml
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
  - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpushare-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpushare-device-plugin
subjects:
- kind: ServiceAccount
  name: gpushare-device-plugin
  namespace: kube-system
kubectl create -f device-plugin-ds.yaml
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: gpushare-device-plugin-ds
  namespace: kube-system
spec:
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        component: gpushare-device-plugin
        app: gpushare
        name: gpushare-device-plugin-ds
    spec:
      serviceAccount: gpushare-device-plugin
      hostNetwork: true
      nodeSelector:
        gpushare: "true"
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-35eccab
        name: gpushare
        # Make this pod as Guaranteed pod which will never be evicted because of node's resource consumption.
        command:
        - gpushare-device-plugin-v2
        - -logtostderr
        - --v=5
        #- --memory-unit=Mi
        resources:
          limits:
            memory: "300Mi"
            cpu: "1"
          requests:
            memory: "300Mi"
            cpu: "1"
        env:
        - name: KUBECONFIG
          value: /etc/kubernetes/kubelet.conf
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
References
https://github.com/AliyunContainerService/gpushare-scheduler-extender
https://github.com/AliyunContainerService/gpushare-device-plugin
Label the GPU-sharing nodes with gpushare
kubectl label node mynode gpushare=true
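A quick way to confirm the label and that the device-plugin DaemonSet has landed on the labelled nodes (selectors taken from the manifests above):
# Nodes that will run the gpushare device plugin
kubectl get nodes -l gpushare=true
# Device-plugin pods created by the DaemonSet
kubectl -n kube-system get pods -l app=gpushare -o wide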
Install the kubectl inspect gpushare extension
curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.12.1/bin/linux/amd64/kubectl
chmod +x ./kubectl
sudo mv ./kubectl /usr/bin/kubectl
cd /usr/bin/
wget https://github.com/AliyunContainerService/gpushare-device-plugin/releases/download/v0.3.0/kubectl-inspect-gpushare
chmod u+x /usr/bin/kubectl-inspect-gpushare
kubectl inspect gpushare   ## check cluster GPU usage
Install MetalLB to provide LoadBalancer support (used later to expose the Kubeflow ingress gateway):
wget https://raw.githubusercontent.com/google/metallb/v0.7.3/manifests/metallb.yaml
kubectl apply -f metallb.yaml
# metallb-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: metallb-system
  name: config
data:
  config: |
    address-pools:
    - name: default
      protocol: layer2
      addresses:
      - 10.18.5.30-10.18.5.50
kubectl apply -f metallb-config.yaml
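Before relying on MetalLB for LoadBalancer addresses, check that its controller and per-node speakers are running:
kubectl -n metallb-system get pods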
Deploy a TensorFlow test workload to exercise the shared GPUs:
kubectl apply -f tensorflow.yaml
# tensorflow.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: tensorflow-gpu
spec:
  replicas: 1
  template:
    metadata:
      labels:
        name: tensorflow-gpu
    spec:
      containers:
      - name: tensorflow-gpu
        image: tensorflow/tensorflow:1.15.0-py3-jupyter
        imagePullPolicy: Never
        resources:
          limits:
            aliyun.com/gpu-mem: 1024
        ports:
        - containerPort: 8888
---
apiVersion: v1
kind: Service
metadata:
  name: tensorflow-gpu
spec:
  ports:
  - port: 8888
    targetPort: 8888
    nodePort: 30888
    name: jupyter
  selector:
    name: tensorflow-gpu
  type: NodePort
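Once the Deployment and Service are applied, a quick sanity check (the URL is just this cluster's example address; any node IP works for a NodePort):
kubectl get pods -l name=tensorflow-gpu
kubectl get svc tensorflow-gpu
# Jupyter should then answer on http://10.18.5.20:30888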
Check cluster GPU usage:
# kubectl inspect gpushare
NAME    IPADDRESS   GPU0(Allocated/Total)  GPU1(Allocated/Total)  GPU Memory(MiB)
master  10.18.5.20  1024/15109             0/15109                1024/30218
node    10.18.5.21  0/15109                0/15109                0/30218
------------------------------------------------------------------
Allocated/Total GPU Memory In Cluster:
1024/60436 (1%)
GPU usage can be tested by scaling the number of tensorflow-gpu replicas and by changing the amount of GPU memory each pod requests:
kubectl scale --current-replicas=1 --replicas=100 deployment/tensorflow-gpu
Testing produced the following results:
Environment
Node | GPUs | GPU memory |
---|---|---|
master | 2 | 15109M*2=30218M |
node | 2 | 15109M*2=30218M |
Test results
GPU memory per pod | Number of pods | GPU utilization |
---|---|---|
256M | 183 | 77% |
512M | 116 | 98% |
1024M | 56 | 94% |
Install Kubeflow (v1.0 RC)
tar -vxf ks_0.12.0_linux_amd64.tar.gz
cp ks_0.12.0_linux_amd64/* /usr/local/bin/
Download kfctl_v1.0-rc.3-1-g24b60e8_linux.tar.gz:
tar -zxvf kfctl_v1.0-rc.3-1-g24b60e8_linux.tar.gz
cp kfctl /usr/bin/
Preparation: create PVs and PVCs, using NFS as the backing file storage
Create a StorageClass
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: local-path
  namespace: kubeflow
#provisioner: example.com/nfs
provisioner: kubernetes.io/gce-pd
parameters:
  type: pd-ssd
kubectl create -f storage.yml
yum install nfs-utils rpcbind
# Create the NFS export directories (at least four are needed)
mkdir -p /data/nfs
vim /etc/exports
# Add the export directory created above
/data/nfs 192.168.122.0/24(rw,sync)
systemctl restart nfs-server.service
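After restarting the NFS server it helps to verify the export before creating the PVs:
# On the NFS server: re-export and list the shares
exportfs -rav
# From any node: confirm the export is visible (10.18.5.20 is the NFS server used in the PV below)
showmount -e 10.18.5.20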
Create the PVs. Because several pods may write files with the same names, it is best to create multiple PVs so each pod mounts its own (at least 4, for katib-mysql, metadata-mysql, minio, and mysql respectively).
# cat mysql-pv.yml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: local-path   # change this for each PVC
spec:
  capacity:
    storage: 200Gi
  accessModes:
  - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  storageClassName: local-path
  nfs:
    path: /data/nfs   # change this for each PVC
    server: 10.18.5.20
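After creating the PVs (one per component), they should bind once Kubeflow creates the matching PVCs; this can be watched with:
# PVs move from Available to Bound as katib-mysql, metadata-mysql, minio and mysql claim them
kubectl get pv
kubectl -n kubeflow get pvc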
Create the kubeflow-anonymous namespace
kubectl create namespace kubeflow-anonymous
Download the Kubeflow 1.0 RC config file from https://github.com/kubeflow/manifests/blob/v1.0-branch/kfdef/kfctl_k8s_istio.yaml
# cat kfctl_k8s_istio.yaml
apiVersion: kfdef.apps.kubeflow.org/v1
kind: KfDef
metadata:
  clusterName: kubernetes
  creationTimestamp: null
  name: 2020-0219
  namespace: kubeflow
spec:
  applications:
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/istio-crds
    name: istio-crds
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/istio-install
    name: istio-install
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/cluster-local-gateway
    name: cluster-local-gateway
  - kustomizeConfig:
      parameters:
      - name: clusterRbacConfig
        value: "OFF"
      repoRef:
        name: manifests
        path: istio/istio
    name: istio
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/add-anonymous-user-filter
    name: add-anonymous-user-filter
  - kustomizeConfig:
      repoRef:
        name: manifests
        path: application/application-crds
    name: application-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: application/application
    name: application
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: cert-manager
      repoRef:
        name: manifests
        path: cert-manager/cert-manager-crds
    name: cert-manager-crds
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: kube-system
      repoRef:
        name: manifests
        path: cert-manager/cert-manager-kube-system-resources
    name: cert-manager-kube-system-resources
  - kustomizeConfig:
      overlays:
      - self-signed
      - application
      parameters:
      - name: namespace
        value: cert-manager
      repoRef:
        name: manifests
        path: cert-manager/cert-manager
    name: cert-manager
  - kustomizeConfig:
      repoRef:
        name: manifests
        path: metacontroller
    name: metacontroller
  - kustomizeConfig:
      overlays:
      - istio
      - application
      repoRef:
        name: manifests
        path: argo
    name: argo
  - kustomizeConfig:
      repoRef:
        name: manifests
        path: kubeflow-roles
    name: kubeflow-roles
  - kustomizeConfig:
      overlays:
      - istio
      - application
      repoRef:
        name: manifests
        path: common/centraldashboard
    name: centraldashboard
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: admission-webhook/bootstrap
    name: bootstrap
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: admission-webhook/webhook
    name: webhook
  - kustomizeConfig:
      overlays:
      - istio
      - application
      parameters:
      - name: userid-header
        value: kubeflow-userid
      repoRef:
        name: manifests
        path: jupyter/jupyter-web-app
    name: jupyter-web-app
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: spark/spark-operator
    name: spark-operator
  - kustomizeConfig:
      overlays:
      - istio
      - application
      - db
      repoRef:
        name: manifests
        path: metadata
    name: metadata
  - kustomizeConfig:
      overlays:
      - istio
      - application
      repoRef:
        name: manifests
        path: jupyter/notebook-controller
    name: notebook-controller
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pytorch-job/pytorch-job-crds
    name: pytorch-job-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pytorch-job/pytorch-operator
    name: pytorch-operator
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: usageId
        value: <randomly-generated-id>
      - name: reportUsage
        value: "true"
      repoRef:
        name: manifests
        path: common/spartakus
    name: spartakus
  - kustomizeConfig:
      overlays:
      - istio
      repoRef:
        name: manifests
        path: tensorboard
    name: tensorboard
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: tf-training/tf-job-crds
    name: tf-job-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: tf-training/tf-job-operator
    name: tf-job-operator
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: katib/katib-crds
    name: katib-crds
  - kustomizeConfig:
      overlays:
      - application
      - istio
      repoRef:
        name: manifests
        path: katib/katib-controller
    name: katib-controller
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/api-service
    name: api-service
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: minioPvcName
        value: minio-pv-claim
      repoRef:
        name: manifests
        path: pipeline/minio
    name: minio
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: mysqlPvcName
        value: mysql-pv-claim
      repoRef:
        name: manifests
        path: pipeline/mysql
    name: mysql
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/persistent-agent
    name: persistent-agent
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/pipelines-runner
    name: pipelines-runner
  - kustomizeConfig:
      overlays:
      - istio
      - application
      repoRef:
        name: manifests
        path: pipeline/pipelines-ui
    name: pipelines-ui
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/pipelines-viewer
    name: pipelines-viewer
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/scheduledworkflow
    name: scheduledworkflow
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/pipeline-visualization-service
    name: pipeline-visualization-service
  - kustomizeConfig:
      overlays:
      - application
      - istio
      parameters:
      - name: admin
        value: [email protected]
      repoRef:
        name: manifests
        path: profiles
    name: profiles
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: seldon/seldon-core-operator
    name: seldon-core-operator
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: namespace
        value: knative-serving
      repoRef:
        name: manifests
        path: knative/knative-serving-crds
    name: knative-crds
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: namespace
        value: knative-serving
      repoRef:
        name: manifests
        path: knative/knative-serving-install
    name: knative-install
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: kfserving/kfserving-crds
    name: kfserving-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: kfserving/kfserving-install
    name: kfserving-install
  repos:
  - name: manifests
    uri: https://github.com/kubeflow/manifests/archive/master.tar.gz
    version: master
status:
  reposCache:
  - localPath: '"../.cache/manifests/manifests-master"'
    name: manifests
# Go into your kubeflow app directory and run:
kfctl apply -V -f kfctl_k8s_istio.yaml
# Configuration is downloaded from GitHub during installation and may fail; retry on failure
A kustomize directory is generated alongside the kubeflow app directory. To prevent image pull failures after a restart, change every image pull policy to IfNotPresent,
then run kfctl apply -V -f kfctl_k8s_istio.yaml again.
Check the deployment status:
kubectl get all -n kubeflow
# Change the istio-ingressgateway service type to LoadBalancer
kubectl -n istio-system edit svc istio-ingressgateway
# change the type below to LoadBalancer
  selector:
    app: istio-ingressgateway
    istio: ingressgateway
    release: istio
  sessionAffinity: None
  type: LoadBalancer
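The same change can be made non-interactively with a patch instead of the editor, which is equivalent to the edit above:
kubectl -n istio-system patch svc istio-ingressgateway -p '{"spec": {"type": "LoadBalancer"}}'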
Save, then check the service again:
# kubectl -n istio-system get svc istio-ingressgateway
NAME                   TYPE           CLUSTER-IP     EXTERNAL-IP   PORT(S)                                                                                                                                      AGE
istio-ingressgateway   LoadBalancer   10.98.19.247   10.18.5.30    15020:32230/TCP,80:31380/TCP,443:31390/TCP,31400:31400/TCP,15029:31908/TCP,15030:31864/TCP,15031:31315/TCP,15032:30372/TCP,15443:32631/TCP   42h
EXTERNAL-IP is the externally reachable address; open http://10.18.5.30 to reach the Kubeflow homepage.
Regarding image pulls: gcr.io images cannot be pulled from within mainland China. They can be fetched like this:
curl -s https://zhangguanzhang.github.io/bash/pull.sh | bash -s -- <image name>
If that also fails, you can build the image on an overseas machine using Alibaba Cloud's manual image build service.
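Whichever way the image is obtained, it usually arrives under a different repository name and has to be retagged back to the gcr.io name the manifests reference; a sketch with placeholder names:
# Example only: pull from a mirror you can reach, then retag to the original gcr.io name
docker pull registry.cn-hangzhou.aliyuncs.com/<your-namespace>/<image>:<tag>
docker tag registry.cn-hangzhou.aliyuncs.com/<your-namespace>/<image>:<tag> gcr.io/<original-project>/<image>:<tag>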