# List the GPU instance profiles supported by this node's MIG-capable GPUs
# (profile IDs, free/total instance counts, memory, SM/DEC/ENC counts).
nvidia-smi mig -lgip
# List the possible placements for each GPU instance profile.
nvidia-smi mig -lgipp
# Tell the GPU operator's MIG manager to partition the node into 1g.5gb slices.
# Quote and require $NODE so an unset/empty value fails loudly instead of
# running `kubectl label nodes nvidia.com/mig.config=...` against nothing.
kubectl label nodes "${NODE:?NODE must be set}" nvidia.com/mig.config=all-1g.5gb
+-----------------------------------------------------------------------------+
| GPU instance profiles: |
| GPU Name ID Instances Memory P2P SM DEC ENC |
| Free/Total GiB CE JPEG OFA |
|=============================================================================|
| 0 MIG 1g.5gb 19 0/7 4.75 No 14 0 0 |
| 1 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.5gb+me 20 0/1 4.75 No 14 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.10gb 15 0/4 9.62 No 14 1 0 |
| 1 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 2g.10gb 14 0/3 9.62 No 28 1 0 |
| 2 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 3g.20gb 9 0/2 19.50 No 42 2 0 |
| 3 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 4g.20gb 5 0/1 19.50 No 56 2 0 |
| 4 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 7g.40gb 0 0/1 39.25 No 98 5 0 |
| 7 1 1 |
+-----------------------------------------------------------------------------+
# Switch the node to 3g.20gb MIG slices; --overwrite replaces the existing
# nvidia.com/mig.config label value. $NODE is quoted and required (see above
# rationale: unquoted/empty expansion silently corrupts the command line).
kubectl label nodes "${NODE:?NODE must be set}" nvidia.com/mig.config=all-3g.20gb --overwrite
# Disable MIG on the node entirely (MIG manager reverts the GPU to full mode).
kubectl label nodes "${NODE:?NODE must be set}" nvidia.com/mig.config=all-disabled --overwrite
# Hostname kao-PowerEdge-R7525 corresponds to Kubernetes node name kao-poweredge-r7525 (node names are lowercased).
# Apply the 3g.20gb MIG profile to this specific node by its (lowercased) name.
kubectl label nodes kao-poweredge-r7525 nvidia.com/mig.config=all-3g.20gb --overwrite
# Dump every node's labels to confirm the MIG config / gpu.product labels took effect.
kubectl get node -o json | jq '.items[].metadata.labels'
# Inspect all GPU-operator resources (pods, daemonsets, services, ...) in its namespace.
kubectl get all -n gpu-operator
# Custom nodePool label; used later as a nodeSelector in the pod manifests below.
kubectl label nodes kao-poweredge-r7525 nodePool=cluster
To deploy the GPU operator (host-managed driver, mixed MIG strategy):
# Install the NVIDIA GPU Operator into its own namespace.
#   driver.enabled=false  -> use the driver already installed on the host
#   mig.strategy=mixed    -> expose MIG devices as per-profile resources
#                            (nvidia.com/mig-1g.5gb etc.) alongside nvidia.com/gpu
# NOTE: line continuations must be a single backslash; the original '\\'
# passes a literal backslash to helm and breaks the command.
helm install --wait --generate-name \
  -n gpu-operator --create-namespace \
  nvidia/gpu-operator \
  --set driver.enabled=false \
  --set mig.strategy=mixed
Check the deployment with a test pod (example manifest below):
# Idle CUDA runtime pod requesting one 1g.5gb MIG slice; useful for exec'ing
# in and running nvidia-smi to verify MIG device visibility.
# (Indentation restored — the original had none, which is invalid YAML.)
apiVersion: v1
kind: Pod
metadata:
  name: nvcuda
spec:
  restartPolicy: OnFailure
  containers:
    - image: nvidia/cuda:12.4.0-runtime-ubuntu22.04
      name: nvcuda
      imagePullPolicy: IfNotPresent
      # Keep the container alive indefinitely so it can be exec'd into.
      command: ["bash"]
      args: ["-c", "sleep infinity"]
      resources:
        limits:
          # One 1g.5gb MIG device (exposed under mig.strategy=mixed).
          nvidia.com/mig-1g.5gb: 1
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-PCIE-40GB
# Create a vectoradd test pod on a node partitioned as 3g.20gb; with
# mig.strategy=mixed the generic nvidia.com/gpu resource is NOT advertised
# for MIG slices, so the gpu.product selector pins it to the MIG-3g.20gb
# product label instead. (YAML indentation restored — the original had none.)
cat << EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vectoradd
spec:
  restartPolicy: OnFailure
  containers:
    - name: vectoradd
      image: nvidia/samples:vectoradd-cuda11.2.1
      resources:
        limits:
          nvidia.com/gpu: 1
  nodeSelector:
    nodePool: cluster
    nvidia.com/gpu.product: NVIDIA-A100-PCIE-40GB-MIG-3g.20gb
EOF
# Variant requesting one 1g.5gb MIG slice explicitly.
# NOTE(review): pod name "cuda-vectoradd" collides with the earlier example —
# delete the first pod (kubectl delete pod cuda-vectoradd) or rename before
# running both. (YAML indentation restored — the original had none.)
cat << EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vectoradd
spec:
  restartPolicy: OnFailure
  containers:
    - name: vectoradd
      image: nvidia/samples:vectoradd-cuda11.2.1
      resources:
        limits:
          nvidia.com/mig-1g.5gb: 1
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-PCIE-40GB
EOF
Sample GPU load generator (save as cuda-load-generator.yaml):
# GPU load-generator pod using NVIDIA's dcgmproftester sample.
# (Indentation restored — the original had none, which is invalid YAML;
# securityContext placed at container level as in the NVIDIA DCGM examples.)
apiVersion: v1
kind: Pod
metadata:
  name: dcgmproftester
spec:
  restartPolicy: OnFailure
  containers:
    - name: dcgmproftester11
      image: nvidia/samples:dcgmproftester
      # -t 1004 selects the DCGM profiling field to stress (presumably the
      # tensor-activity test — confirm against dcgmproftester docs);
      # -d 120 runs the load for 120 seconds.
      args: ["--no-dcgm-validation", "-t 1004", "-d 120"]
      resources:
        limits:
          nvidia.com/gpu: 1
      # SYS_ADMIN is required for the profiling counters the tester reads.
      securityContext:
        capabilities:
          add: ["SYS_ADMIN"]