Vadim Markovtsev, Athenian & Fragile Tech.
Vadim Markovtsev
Head of Analytics and ML at Athenian
Founder at Fragile Tech.
# Install k0s as a controller node and start it as a systemd service.
sudo k0s install controller
sudo systemctl start k0scontroller.service
# Generate a join token for worker nodes (saved to a file and passed
# to `k0s install worker --token-file` on each worker).
sudo k0s token create --role=worker
# Install kubectl and bootstrap a kubeconfig from the k0s admin certificate.
sudo snap install kubectl --classic
sudo cp /var/lib/k0s/pki/admin.conf .
# Quote all expansions (ShellCheck SC2046/SC2086): usernames and paths
# containing spaces or glob characters would otherwise word-split.
sudo chown "$(whoami)" admin.conf
export KUBECONFIG="$(pwd)/admin.conf"
# Grant the current OS user cluster-admin rights via an RBAC binding.
export clusterUser="$(whoami)"
kubectl create clusterrolebinding "${clusterUser}-admin-binding" \
  --clusterrole=admin --user="$clusterUser"
# Replace the temporary admin.conf with a user-scoped kubeconfig in ~/.kube.
mkdir -p ~/.kube && export KUBECONFIG=
sudo k0s kubeconfig create --groups "system:masters" \
  "$clusterUser" > ~/.kube/config
# Join a worker node using the token from `k0s token create` above.
# Needs root to register the systemd service — same as the controller
# install on the first slide, which uses sudo.
sudo k0s install worker --token-file /path/to/token/file
# Debug containers directly through k0s's containerd socket:
#   sudo ctr --address /run/k0s/containerd.sock -n k8s.io <subcommand>
`kubectl logs` and `kubectl exec` time out with roughly an 80% chance.
ImagePullBackOff on system pods with roughly a 50% chance.
# Pod spec excerpt: requesting NVIDIA GPUs via the device-plugin resource.
metadata:
spec:
...
containers:
- ...
# NOTE(review): elided slide excerpt — in a full manifest this nests under
# containers[].resources.limits; confirm against the real deployment YAML.
limits:
# Schedules the pod only onto a node exposing at least two GPUs.
nvidia.com/gpu: 2
# ls /dev/nvidia*
/dev/nvidia0 /dev/nvidia1
Workaround: add `nvidia-smi` to rc.local so the GPU device nodes are initialized at boot.
🤦‍♂️ The containerd configuration on the worker nodes lives under /run/k0s/containerd*.
nvidia.com/gpu resource
kubectl label nodes --all cloud.google.com/gke-accelerator=true
$ kubectl get daemonset -n kube-system nvidia-gpu-device-plugin
NAME                       DESIRED  CURRENT  READY  UP-TO-DATE
nvidia-gpu-device-plugin   3        3        3      3
Need GPU overcommit for the lab 🤔
kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
$ kubectl get storageclass
NAME                   PROVISIONER            RECLAIMPOLICY  AGE
local-path (default)   rancher.io/local-path  Delete         13d
# Stage a Kubeflow deployment directory and apply the dex+istio manifest.
export BASE_DIR=/opt
export KF_NAME=my-kubeflow
# Quote the composed path (SC2086) — safe even if BASE_DIR/KF_NAME
# are later changed to values containing spaces.
export KF_DIR="${BASE_DIR}/${KF_NAME}"
# Download kfctl from https://github.com/kubeflow/kfctl/releases
kfctl apply -V -f kfctl_istio_dex.yaml
CrashLoopBackOff in Katib.
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80