본문 바로가기

컴퓨터/클라우드 (Cloud)

kubernetes, helm, gpu monitoring 명령어 정리

gpu operator helm repository

helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
   && helm repo update

 

helm install gpu operator

helm install --wait --generate-name   \
-n gpu-operator --create-namespace       nvidia/gpu-operator       --set driver.enabled=false

 

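helm delete gpu operator (the release name is auto-generated by --generate-name, so look it up with helm ls first; the name below is an example)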

helm ls -n gpu-operator

helm delete -n gpu-operator gpu-operator-1675669830

 

helm install custom gpu monitor (nvidia-gpu-exporter chart; Grafana dashboard linked below)

https://grafana.com/grafana/dashboards/14574-nvidia-gpu-metrics/

 

Nvidia GPU Metrics | Grafana Labs (grafana.com)

 

helm repo add utkuozdemir https://utkuozdemir.org/helm-charts

helm install my-release -n gpu-operator-custom --create-namespace \
--set service.type='NodePort'  utkuozdemir/nvidia-gpu-exporter

 

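helm save prometheus values file (the prometheus-community repo must be added first)

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts \
   && helm repo update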

helm inspect values prometheus-community/kube-prometheus-stack > kube-prometheus-stack.values

 

helm install prometheus

helm install prometheus-community/kube-prometheus-stack    --create-namespace \
--namespace prometheus  --generate-name    --values kube-prometheus-stack.values

 

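kubectl patch grafana service type to NodePort (the service name contains the generated release name; check it with kubectl get svc -n prometheus)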

kubectl patch svc \
kube-prometheus-stack-1675662898-grafana -n prometheus -p '{ "spec": { "type": "NodePort" } }'