kubeflow搭建

发布时间:2026/6/9 16:38:20

kubeflow搭建 目录一、架构说明二、Kubernetes搭建三、Volcano部署四、kubeflow环境搭建五、配置kubeflow用户名称空间可选部署model-registry一、架构说明本次环境部署方案采用 Kubernetes 集群架构结合 Volcano、GPU Operator 和 Kubeflow 组件。需要准备两台虚拟机单主 Master 节点单 Worker 节点用于运行集群组件完成基础环境部署后当需要接入带GPU资源的宿主机时只需进行初始化设置并安装containerd即可将其加入集群。二、Kubernetes搭建建议配置两台虚拟机分别作为Kubernetes集群的主节点和工作节点主节点配置要求仅需基础CPU和内存资源即可满足集群控制需求推荐通过二进制或包管理器安装containerd使用apt安装kubeadm和kubelet等核心组件工作节点配置要求需要分配充足的CPU和内存资源建议直接调用主机CPU资源以避免部分组件调用的cpu flag位不存在通用配置要求所有节点均需关闭防火墙配置时间同步服务禁用swap分区调整文件句柄为65536ulimit -n#Kubernetes集群v1.30.14#部署环境虚拟机系统--ubuntu2204containerd v2.1.0#部署预设系统环境#1使用chronyd服务同步节点 时间#2在本地hosts文件解析各节点#3禁用所有Swap设备#4禁用默认的iptables策略#5安装v1.30.14版本的kubeadm、kubelet和kubectl#6文件句柄配置为65536ulimit -n 结果为65536#禁用所有节点的swap并配置本地解析#192.168.100.100 kubeflow-node1.kubeflow.com kubeflow-node1 kubeapi.kubeflow.com kubeapi#192.168.100.101 kubeflow-node2.kubeflow.com#前期准备 #查询nerdctl官网全量包下载至本地完成containerd和nerdctl的二进制按照 rootkubeflow-node1:~# tar xzf /tmp/nerdctl-full-2.1.1-linux-amd64.tar.gz -C /usr/local/ rootkubeflow-node1:~# mkdir -p /etc/containerd/certs.d/ #containerd v2.1.0使用代理仓库必须与主配置文件分开 rootkubeflow-node1:~# cat /etc/containerd/config.toml version 2 root /var/lib/containerd state /run/containerd oom_score -999 [grpc] max_recv_message_size 16777216 max_send_message_size 16777216 [debug] level info [metrics] address grpc_histogram false [plugins] [plugins.io.containerd.grpc.v1.cri] sandbox_image registry.k8s.io/pause:3.9 max_container_log_line_size -1 [plugins.io.containerd.grpc.v1.cri.containerd] default_runtime_name runc snapshotter overlayfs [plugins.io.containerd.grpc.v1.cri.containerd.runtimes] [plugins.io.containerd.grpc.v1.cri.containerd.runtimes.runc] runtime_type io.containerd.runc.v2 runtime_engine runtime_root [plugins.io.containerd.grpc.v1.cri.containerd.runtimes.runc.options] SystemdCgroup true [plugins.io.containerd.grpc.v1.cri.registry] config_path /etc/containerd/certs.d rootkubeflow-node1:~# cd /etc/containerd/certs.d/ 
#containerd使用代理仓库拉取不同上游源时，需要单独为每个源创建对应上游源名称的目录，并在该目录下创建对应的hosts.toml文件，同时在该文件内指明代理的上游源访问链接 rootkubeflow-node1:/etc/containerd/certs.d# mkdir {docker.io,quay.io,registry.custom.local:12480} rootkubeflow-node1:/etc/containerd/certs.d# cat docker.io/hosts.toml server = "https://docker.io" [host."https://swr.cn-north-4.myhuaweicloud.com/ddn-k8s/register.k8s.io"] capabilities = ["pull", "resolve"] skip_verify = true rootkubeflow-node1:/etc/containerd/certs.d# cd ~ rootkubeflow-node1:~# vim /etc/hosts 192.168.100.100 kubeflow-node1.kubeflow.com kubeflow-node1 kubeapi.kubeflow.com kubeapi 192.168.100.101 kubeflow-node2.kubeflow.com kubeflow-node2 rootkubeflow-node1:~# vim /etc/default/kubelet KUBELET_EXTRA_ARGS="--cgroup-driver=systemd" rootkubeflow-node1:~# vim /etc/sysctl.conf net.bridge.bridge-nf-call-iptables = 1 net.bridge.bridge-nf-call-ip6tables = 1 net.ipv4.ip_forward = 1 rootkubeflow-node1:~# sysctl -p net.ipv4.ip_forward = 1 net.bridge.bridge-nf-call-iptables = 1 net.bridge.bridge-nf-call-ip6tables = 1 rootkubeflow-node1:~# vim /etc/modules-load.d/k8s.conf overlay br_netfilter rootkubeflow-node1:~# modprobe overlay rootkubeflow-node1:~# modprobe br_netfilter rootkubeflow-node1:~# systemctl daemon-reload rootkubeflow-node1:~# systemctl enable --now containerd rootkubeflow-node1:~# systemctl enable --now kubelet #部署Kubernetes集群 rootkubeflow-node1:~# kubeadm config images pull rootkubeflow-node1:~# kubeadm init --control-plane-endpoint=kubeapi.kubeflow.com --kubernetes-version=v1.30.14 --pod-network-cidr=10.243.0.0/16 --service-cidr=10.97.0.0/16 --token-ttl=0 --upload-certs [init] Using Kubernetes version: v1.30.14 [preflight] Running pre-flight checks [preflight] Pulling images required for setting up a Kubernetes cluster ....... Your Kubernetes control-plane has initialized successfully! 
rootkubeflow-node1:~# mkdir -p $HOME/.kube rootkubeflow-node1:~# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config rootkubeflow-node1:~# chown $(id -u):$(id -g) $HOME/.kube/config rootkubeflow-node1:~# echo "source <(kubectl completion bash)" >> ~/.bashrc rootkubeflow-node1:~# mkdir addons rootkubeflow-node1:~# cd addons/ #在github下载如下项目的官方包 rootkubeflow-node1:~/addons# ls calico-3.28.4 helm-v3.16.0-linux-amd64.tar.gz ingress kuboard metallb-0.14.6 nfs-dirver rootkubeflow-node1:~/addons# kubectl apply -f calico-3.28.4/manifests/calico.yaml rootkubeflow-node1:~/addons# kubectl apply -f calico-3.28.4/manifests/calico-typha.yaml rootkubeflow-node1:~/addons# kubectl apply -f calico-3.28.4/manifests/calicoctl.yaml #待calico的pod运行后，即可观察到master节点状态转为Ready，此时可以到worker节点执行命令加入集群 rootkubeflow-node1:~/addons# kubectl get nodes NAME STATUS ROLES AGE VERSION kubeflow-node1 Ready control-plane 26h v1.30.14 #部署helm二进制包 rootkubeflow-node1:~/addons# tar xzvf helm-v3.16.0-linux-amd64.tar.gz rootkubeflow-node1:~/addons# mv linux-amd64/helm /usr/local/bin/ rootkubeflow-node1:~/addons# chmod +x /usr/local/bin/helm rootkubeflow-node1:~/addons# helm version version.BuildInfo{Version:v3.16.0, GitCommit:0d439e1a09683f21a0ab9401eb661401f185b00b, GitTreeState:clean, GoVersion:go1.22.6} rootkubeflow-node1:~/addons# kubectl apply -f metallb-0.14.6/config/manifests/metallb-native.yaml rootkubeflow-node1:~/addons# kubectl apply -f ingress/deploy.yaml rootkubeflow-node1:~/addons/nfs-dirver# ls crd-csi-snapshot.yaml csi-nfs-driverinfo.yaml csi-snapshot-controller.yaml rbac-snapshot-controller.yaml csi-nfs-controller.yaml csi-nfs-node.yaml rbac-csi-nfs.yaml storageclass.yaml #此处需要将新建的SC设置为默认，否则部署kubeflow时pvc会创建失败，提示找不到SC rootkubeflow-node1:~/addons/nfs-dirver# vim storageclass.yaml --- apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: name: nfs-csi annotations: storageclass.kubernetes.io/is-default-class: "true" provisioner: nfs.csi.k8s.io parameters: server: 192.168.100.200 share: / # 
csi.storage.k8s.io/provisioner-secret is only needed for providing mountOptions in DeleteVolume # csi.storage.k8s.io/provisioner-secret-name: mount-options # csi.storage.k8s.io/provisioner-secret-namespace: default reclaimPolicy: Delete volumeBindingMode: Immediate mountOptions: - nfsvers4.1 #server为nfs服务器IPshare为共享的目录 rootkubeflow-node1:~/addons/nfs-dirver# kubectl apply -f . #创建二层网络 rootkubeflow-node1:~/addons# vim eip-pool.yaml --- apiVersion: metallb.io/v1beta1 kind: IPAddressPool metadata: name: localip-pool namespace: metallb-system spec: addresses: - 192.168.100.150-192.168.100.155 autoAssign: true avoidBuggyIPs: true rootkubeflow-node1:~/addons# vim L2.yaml --- apiVersion: metallb.io/v1beta1 kind: L2Advertisement metadata: name: localip-pool-l2a namespace: metallb-system spec: ipAddressPools: - localip-pool interfaces: - eth0 rootkubeflow-node1:~/addons# kubectl apply -f ip/eip-pool.yaml ipaddresspool.metallb.io/localip-pool created rootkubeflow-node1:~/addons# kubectl apply -f ip/L2.yaml l2advertisement.metallb.io/localip-pool-l2a created rootkubeflow-node1:~/addons# vim kuboard/kuboard-v3-storage-class.yaml --- apiVersion: v1 kind: Namespace metadata: name: kuboard --- apiVersion: v1 kind: ConfigMap metadata: name: kuboard-config namespace: kuboard data: KUBOARD_ENDPOINT: http://kuboard.kubeflow.com KUBOARD_AGENT_KEY: 32b7d6572c6255211b4eec9009e4a816 --- apiVersion: v1 kind: ServiceAccount metadata: name: kuboard-admin namespace: kuboard --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: kuboard-admin roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-admin subjects: - kind: ServiceAccount name: kuboard-admin namespace: kuboard --- # 单节点 ETCD官方专用 apiVersion: apps/v1 kind: StatefulSet metadata: name: kuboard-etcd namespace: kuboard spec: replicas: 1 serviceName: kuboard-etcd selector: matchLabels: app: kuboard-etcd template: metadata: labels: app: kuboard-etcd spec: containers: - name: etcd 
image: quay.io/coreos/etcd:v3.4.16 command: - etcd - --data-dir/etcd-data - --namekuboard-etcd-0 - --initial-clusterkuboard-etcd-0http://kuboard-etcd-0.kuboard-etcd.kuboard.svc.cluster.local:2380 - --initial-cluster-statenew - --initial-advertise-peer-urlshttp://kuboard-etcd-0.kuboard-etcd.kuboard.svc.cluster.local:2380 - --listen-peer-urlshttp://0.0.0.0:2380 - --advertise-client-urlshttp://kuboard-etcd-0.kuboard-etcd.kuboard.svc.cluster.local:2379 - --listen-client-urlshttp://0.0.0.0:2379 ports: - name: client containerPort: 2379 - name: peer containerPort: 2380 volumeMounts: - name: etcd-data mountPath: /etcd-data volumeClaimTemplates: - metadata: name: etcd-data spec: accessModes: [ ReadWriteOnce ] resources: requests: storage: 1Gi --- apiVersion: v1 kind: Service metadata: name: kuboard-etcd namespace: kuboard spec: clusterIP: None selector: app: kuboard-etcd ports: - name: client port: 2379 - name: peer port: 2380 --- # Kuboard v3 单节点版本 apiVersion: apps/v1 kind: Deployment metadata: name: kuboard-v3 namespace: kuboard spec: replicas: 1 selector: matchLabels: app: kuboard-v3 template: metadata: labels: app: kuboard-v3 spec: containers: - name: kuboard image: eipwork/kuboard:v3 imagePullPolicy: IfNotPresent ports: - containerPort: 80 env: - name: KUBOARD_ETCD_ENDPOINTS value: kuboard-etcd-0.kuboard-etcd.kuboard.svc.cluster.local:2379 - name: KUBOARD_ENDPOINT value: http://kuboard.kubeflow.com - name: KUBOARD_AGENT_KEY value: 32b7d6572c6255211b4eec9009e4a816 --- apiVersion: v1 kind: Service metadata: name: kuboard-v3 namespace: kuboard spec: ports: - port: 80 targetPort: 80 selector: app: kuboard-v3 rootkubeflow-node1:~/addons# vim kuboard/ingress.yaml --- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: kuboard-v3 namespace: kuboard spec: ingressClassName: nginx rules: - host: kuboard.kubeflow.com http: paths: - path: / backend: service: name: kuboard-v3 port: number: 80 pathType: Prefix rootkubeflow-node1:~/addons# kubectl apply -f 
kuboard/ 在worker节点上执行和master节点相同的命令进行前期准备。需要注意：由于初始环境未加入含有GPU资源的节点，所以单个worker节点会承载基本所有的服务pod，该节点的cpu需要配置在12C以上，否则后续会有pod无法启动。#加入Kubernetes集群 rootkubeflow-node2:~# kubeadm join kubeapi.kubeflow.com:6443 --token s0nzbq.tspjl2ol3pdkr7f3 --discovery-token-ca-cert-hash sha256:de8847cff36931235b8146cddc11e11f26f717eaf7d8f94118b56031c1361fad 三、Volcano部署 Volcano可以使Kubernetes集群具备GPU调度、队列和优先级的功能。#只需以下一条命令即可完成volcano的部署 rootkubeflow-node1:~/addons# kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml #kubeflow的部署建议使用kustomize的方式进行，因此本次安装v5.0.1版本进行适配 rootkubeflow-node1:~/addons# tar xzvf kustomize_v5.0.1_linux_amd64.tar.gz kustomize rootkubeflow-node1:~/addons# chmod +x kustomize rootkubeflow-node1:~/addons# mv kustomize /usr/local/bin/ rootkubeflow-node1:~/addons# kustomize version v5.0.1 #下载github内的metacontroller进行部署，若要部署kubeflow需要该项目的crds资源，必须在部署pipeline前完成该组件安装 rootkubeflow-node1:~/addons# ls metacontroller/ kustomization.yaml metacontroller-crds-v1.yaml metacontroller-namespace.yaml metacontroller-rbac.yaml metacontroller.yaml rootkubeflow-node1:~/addons# kustomize build metacontroller/ | kubectl apply -f - #手动创建image CRDs资源，补全该组件才能正常安装Knative rootkubeflow-node1:~/addons# vim kubeflow/caching-image-crd.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: images.caching.internal.knative.dev labels: knative.dev/crd-install: "true" serving.knative.dev/release: v1.11.0 spec: group: caching.internal.knative.dev names: kind: Image plural: images singular: image categories: - all - knative-internal - caching scope: Namespaced versions: - name: v1alpha1 served: true storage: true schema: openAPIV3Schema: type: object properties: apiVersion: type: string kind: type: string metadata: type: object spec: type: object properties: image: type: string serviceAccountName: type: string status: type: object properties: cacheReady: type: boolean digest: type: string observedGeneration: type: integer format: 
int64 conditions: type: array items: type: object properties: type: type: string status: type: string reason: type: string message: type: string lastTransitionTime: type: string format: date-time conversion: strategy: None rootkubeflow-node1:~/addons# kubectl apply -f kubeflow/caching-image-crd.yaml customresourcedefinition.apiextensions.k8s.io/images.caching.internal.knative.dev created四、kubeflow环境搭建Kubeflow 是谷歌开源的一个 机器学习ML工作流平台旨在让在 Kubernetes 上部署、管理和扩展机器学习全流程变得更加简单、可移植和可扩展。OIDC AuthService 用户认证 / 登录鉴权Profiles 多租户Notebooks 提供开发环境Katib 超参调优Pipelines 机器学习流水线Cert-Manager 证书管理#将github上kubeflow的git包缓存至本地,本次使用项目版本为v1.9.0 rootkubeflow-node1:~/addons# cd kubeflow/manifests #使用kustomize部署证书服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - #部署istio服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/istio-1-22/istio-crds/base | kubectl apply -f - rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/istio-1-22/istio-namespace/base | kubectl apply -f - namespace/istio-system created rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/istio-1-22/istio-install/overlays/oauth2-proxy | kubectl apply -f - #部署Oauth2-proxy服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/oidc-client/oauth2-proxy/overlays/m2m-self-signed/ | kubectl apply -f - #可选修改登录使用的默认邮箱 rootkubeflow-node1:~/addons/kubeflow/manifests# cd common/dex/overlays/oauth2-proxy/ rootkubeflow-node1:~/addons/kubeflow/manifests/common/dex/overlays/oauth2-proxy# vim config-map.yaml ...... staticPasswords: - email: adminkubeflow.com hashFromEnv: DEX_USER_PASSWORD username: admin ...... 
rootkubeflow-node1:~/addons/kubeflow/manifests/common/dex/overlays/oauth2-proxy# cd - /root/addons/kubeflow/manifests #部署dex服务用于账号登录 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/dex/overlays/oauth2-proxy | kubectl apply -f - #部署Knative服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/knative/knative-serving/overlays/gateways | kubectl apply -f - rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/istio-1-22/cluster-local-gateway/base | kubectl apply -f - #创建kubeflow部署位置的名称空间 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/kubeflow-namespace/base | kubectl apply -f - namespace/kubeflow created #创建kubeflow部署所需要的角色 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/kubeflow-roles/base | kubectl apply -f - #创建Profiles KFAM服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/profiles/upstream/overlays/kubeflow | kubectl apply -f - #可选操作：可修改profile开启宿主机网络高优先级 rootkubeflow-node1:~/addons/kubeflow/manifests# kubectl -n kubeflow edit deployment profiles-deployment ...... dnsPolicy: ClusterFirstWithHostNet hostNetwork: true restartPolicy: Always ...... #部署admission-webhook服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/admission-webhook/upstream/overlays/cert-manager | kubectl apply -f - #部署PVC Viewer Controller，便于查询pvc内的文件 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/pvcviewer-controller/upstream/default | kubectl apply -f - #部署Volumes Web Application，便于管理存储卷 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/volumes-web-app/upstream/overlays/istio | kubectl apply -f - #部署KServe服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build contrib/kserve/kserve | kubectl apply -f - #初次创建资源较慢，需要等待30秒后重新执行 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build contrib/kserve/kserve | kubectl apply -f - #30秒后确认资源创建成功并生效 rootkubeflow-node1:~/addons/kubeflow/manifests# echo $? 
0 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build contrib/kserve/models-web-app/overlays/kubeflow | kubectl apply -f - #部署Katib服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/katib/upstream/installs/katib-with-kubeflow | kubectl apply -f - #部署pipeline服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/pipeline/upstream/env/cert-manager/platform-agnostic-multi-user | kubectl apply -f - #部署centraldashboard展示板服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/centraldashboard/upstream/overlays/kserve | kubectl apply -f - #部署notebooks服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/jupyter/notebook-controller/upstream/overlays/kubeflow | kubectl apply -f - rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/jupyter/jupyter-web-app/upstream/overlays/istio | kubectl apply -f - #部署Tensorboard服务 rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/tensorboard/tensorboards-web-app/upstream/overlays/istio | kubectl apply -f - rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow | kubectl apply -f - #部署Training Operator rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl apply -f - #以上模块顺利部署后即完成kubeflow的部署 #若容器镜像为离线部署时需注意部分deployment的imagePullPolicy为Always需要手动修改yaml文件或使用命令修改为IfNotPresent才能正常启动如下 rootkubeflow-node1:~/addons# kubectl -n kubeflow edit deployments.apps cache-server ...... 
image: gcr.io/ml-pipeline/cache-server:2.2.0 imagePullPolicy: IfNotPresent ...... 五、配置kubeflow用户名称空间 #为istio服务创建外部EXTERNAL IP便于访问 rootkubeflow-node1:~# kubectl get svc istio-ingressgateway -n istio-system NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway ClusterIP 10.97.45.249 <none> 15021/TCP,80/TCP,443/TCP 2d15h rootkubeflow-node1:~# kubectl patch svc istio-ingressgateway -n istio-system -p '{"spec":{"type":"LoadBalancer"}}' service/istio-ingressgateway patched rootkubeflow-node1:~# kubectl get svc istio-ingressgateway -n istio-system NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway LoadBalancer 10.97.45.249 192.168.100.151 15021:32008/TCP,80:31585/TCP,443:31367/TCP 2d17h #由于多数服务为https，故需要创建证书 rootkubeflow-node1:~# cd addons/kubeflow/ rootkubeflow-node1:~/addons/kubeflow# mkdir certs && cd certs #生成自签名证书 rootkubeflow-node1:~/addons/kubeflow/certs# openssl req -x509 -sha256 -nodes -days 3650 -newkey rsa:2048 \ -keyout kubeflow.key \ -out kubeflow.crt \ -subj "/CN=kubeflow.example.com/O=kubeflow" rootkubeflow-node1:~/addons/kubeflow/certs# ls kubeflow.crt kubeflow.key rootkubeflow-node1:~/addons/kubeflow/certs# kubectl create -n istio-system secret tls kubeflow-tls \ --key=kubeflow.key \ --cert=kubeflow.crt secret/kubeflow-tls created rootkubeflow-node1:~/addons/kubeflow/certs# vim https.yaml apiVersion: networking.istio.io/v1alpha3 kind: Gateway metadata: name: kubeflow-gateway namespace: kubeflow spec: selector: istio: ingressgateway servers: - port: number: 80 name: http protocol: HTTP tls: httpsRedirect: true hosts: - "*" - port: number: 443 name: https protocol: HTTPS tls: mode: SIMPLE credentialName: kubeflow-tls hosts: - "*" rootkubeflow-node1:~/addons/kubeflow/certs# kubectl apply -f https.yaml gateway.networking.istio.io/kubeflow-gateway created rootkubeflow-node1:~/addons/kubeflow/manifests# kustomize build common/user-namespace/base | kubectl apply -f - 在web端输入https://192.168.100.151即可访问kubeflow 
dashboard页面，账号名默认为user@example.com；若按照以上操作修改了默认登录邮箱，请使用admin@kubeflow.com，密码为12341234。部署完成用户名称空间后才能在管理界面创建Notebooks等资源。可选部署model-registry rootkubeflow-node1:~# git clone --depth 1 -b v0.3.7 https://github.com/kubeflow/model-registry.git rootkubeflow-node1:~# cd model-registry/manifests/kustomize #设置用户名称空间等参数 rootkubeflow-node1:~/model-registry/manifests/kustomize# PROFILE_NAME=kubeflow-user-example-com rootkubeflow-node1:~/model-registry/manifests/kustomize# for DIR in options/istio overlays/db ; do (cd $DIR; kustomize edit set namespace $PROFILE_NAME); done #部署model-registry服务 rootkubeflow-node1:~/model-registry/manifests/kustomize# kubectl apply -k overlays/db rootkubeflow-node1:~/model-registry/manifests/kustomize# kubectl apply -k options/istio rootkubeflow-node1:~/model-registry/manifests/kustomize# kubectl apply -k options/ui/overlays/istio #修改central dashboard的界面，添加models服务 rootkubeflow-node1:~/model-registry/manifests/kustomize# kubectl get configmap centraldashboard-config -n kubeflow -o json | jq '.data.links |= (fromjson | .menuLinks += [{"icon": "assignment", "link": "/model-registry/", "text": "Model Registry", "type": "item"}] | tojson)' | kubectl apply -f - -n kubeflow #也可直接修改cm文件实现 rootkubeflow-node1:~/model-registry/manifests/kustomize# kubectl edit configmap -n kubeflow centraldashboard-config apiVersion: v1 data: links: |- { "menuLinks": [ { "icon": "assignment", "link": "/model-registry/", "text": "Model Registry", "type": "item" }, ... 
#建议加上下面参数，避免默认不信任自签名证书 kubectl patch deployment model-registry-ui -n kubeflow \ -p '{"spec":{"template":{"spec":{"containers":[{"name":"model-registry-ui","env":[{"name":"NODE_TLS_REJECT_UNAUTHORIZED","value":"0"}]}]}}}}' 六、部署NVIDIA GPU Operator NVIDIA GPU Operator是一个Kubernetes Operator，它自动化了在 Kubernetes 集群中管理 GPU 基础设施所需的所有组件的部署、配置和生命周期管理。它的核心理念是将 GPU 作为 Kubernetes 中的一等公民，像管理 CPU 资源一样轻松管理 GPU。部署后只需声明需要几个GPU或GPU分片资源，Operator会自动安装驱动、配置容器运行时、部署Device Plugin并设置监控。由于本环境已安装helm v3，可以使用helm源安装相关服务。#添加nvidia gpu operator的源 rootkubeflow-node1:~# helm repo add nvidia https://helm.ngc.nvidia.com/nvidia #可选：添加longhorn的源，可部署longhorn分布式块存储 rootkubeflow-node1:~# helm repo add longhorn https://charts.longhorn.io #可选：可部署localai服务，统一下载管理模型并提供api rootkubeflow-node1:~# helm repo add go-skynet https://go-skynet.github.io/helm-charts/ rootkubeflow-node1:~# helm repo update rootkubeflow-node1:~# helm install gpu-operator nvidia/gpu-operator \ --namespace gpu-operator \ --create-namespace \ --set driver.enabled=true #可选：部署longhorn服务（persistence.defaultClass定义是否作为默认SC，defaultSettings.defaultReplicaCount定义分布式存储的副本数；注释不能写在行尾续行符“\”之后，否则命令会被截断） rootkubeflow-node1:~# kubectl create ns longhorn-system rootkubeflow-node1:~# helm install longhorn longhorn/longhorn -n longhorn-system \ --set persistence.defaultClass=false \ --set defaultSettings.defaultReplicaCount=2 #可选：部署localai服务 rootkubeflow-node1:~# helm show values go-skynet/local-ai > values.yaml rootkubeflow-node1:~# vim values.yaml .... env: threads: 4 context_size: 512 HF_ENDPOINT: https://hf-mirror.com HUGGINGFACE_BASE_URL: https://hf-mirror.com LLAMACPP_HF_MIRROR: https://hf-mirror.com HUGGINGFACE_HUB_CACHE: /models HF_HOME: /models HTTP_PROXY: http://<squid IP>:3128 HTTPS_PROXY: http://<squid IP>:3128 .... 
rootkubeflow-node1:~# kubectl create ns localai rootkubeflow-node1:~# helm install local-ai go-skynet/local-ai -f values.yaml -n localai 以下为本次项目使用的github链接： https://github.com/kubernetes-sigs/kustomize/ https://github.com/metacontroller/metacontroller/releases https://github.com/volcano-sh/volcano https://github.com/kubeflow/manifests

相关新闻