高新 | AI Platform Development Engineer

Installation

Deployment scripts: https://gitee.com/MLcore-Engine/ai-infra-deploy

update kernel

#configure the Aliyun yum mirrors
curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo


#rebuild the yum cache and update

yum clean all
yum makecache
yum -y update

#import the ELRepo public key and install the ELRepo release package
rpm --import https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
rpm -Uvh http://www.elrepo.org/elrepo-release-7.0-3.el7.elrepo.noarch.rpm

#list available packages in the elrepo-kernel repository
yum list available --disablerepo='*' --enablerepo=elrepo-kernel
#list the kernel versions available from elrepo-kernel
yum --disablerepo=\* --enablerepo=elrepo-kernel list kernel


#install the mainline kernel (remove the old kernel-tools packages first to avoid conflicts)
yum remove kernel-tools-libs.x86_64 kernel-tools.x86_64
#yum -y --enablerepo=elrepo-kernel install kernel-ml.x86_64 kernel-ml-tools.x86_64
sudo yum --enablerepo=elrepo-kernel install kernel-ml kernel-ml-devel

set the default boot kernel

#list all kernel menu entries on the system
awk -F\' '$1=="menuentry " {print $2}' /etc/grub2.cfg

#boot the new kernel by default (entry 0 is the newest)
grub2-set-default 0

#run grub2-mkconfig to regenerate the GRUB configuration
grub2-mkconfig -o /boot/grub2/grub.cfg
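
Optional check before rebooting (grub2-editenv shows the entry grub2-set-default saved):
grub2-editenv list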

reboot

basic environment

systemctl stop firewalld && systemctl disable firewalld
sed -i '/^SELINUX=/c SELINUX=disabled' /etc/selinux/config
setenforce 0

swapoff -a
sed -i 's/^.*centos-swap/#&/g' /etc/fstab

cat << EOF >> /etc/hosts
192.168.31.127 master
192.168.31.128 kube
EOF


# load the br_netfilter module and make it persist across reboots
modprobe br_netfilter
cat << EOF > /etc/modules-load.d/k8s.conf
br_netfilter
EOF

# kernel parameters: enable IP forwarding and let iptables process bridged traffic
cat << EOF > /etc/sysctl.d/k8s.conf
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
EOF

# apply immediately
sysctl --system
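
Quick sanity check, each of the three keys should report 1:
sysctl net.ipv4.ip_forward net.bridge.bridge-nf-call-iptables net.bridge.bridge-nf-call-ip6tables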


#time sync
#master node
yum install -y chrony
sed -i 's/^server/#&/' /etc/chrony.conf
cat >> /etc/chrony.conf << EOF
server ntp1.aliyun.com iburst
local stratum 10
allow
EOF
systemctl restart chronyd && systemctl enable chronyd

#worker nodes (sync from the master; replace 172.16.101.11 below with your master's IP)
yum install -y chrony
sed -i 's/^server/#&/' /etc/chrony.conf
cat >> /etc/chrony.conf  << EOF
server 172.16.101.11 iburst
EOF
systemctl restart chronyd && systemctl enable chronyd
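
To verify sync on a node, chronyc lists its sources (the server marked ^* is the selected one):
chronyc sources -v
chronyc tracking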
#find the largest disk and mount it at /data, then relocate the kubelet and docker data dirs
#(create the symlinks before installing kubelet/docker, while the /var/lib paths do not yet exist)
mkdir -p /data/kubelet /data/docker
ln -s /data/kubelet /var/lib/kubelet
ln -s /data/docker /var/lib/docker
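
A minimal sketch for spotting the largest disk, assuming standard lsblk output; see the disk partitioning and mounting section below for formatting and /etc/fstab details:
lsblk -b -d -o NAME,SIZE,TYPE | sort -k2 -nr    # disks sorted largest first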

install docker

#centos
sudo yum remove docker \
                  docker-client \
                  docker-client-latest \
                  docker-common \
                  docker-latest \
                  docker-latest-logrotate \
                  docker-logrotate \
                  docker-engine
                  
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
#latest
sudo yum install docker-ce docker-ce-cli containerd.io -y

#ubuntu
for pkg in docker.io docker-doc docker-compose docker-compose-v2 podman-docker containerd runc; do sudo apt-get remove $pkg; done

# Add Docker's official GPG key:
sudo apt-get update
sudo apt-get install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://mirrors.aliyun.com/docker-ce/linux/ubuntu \
  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update

#latest
sudo apt-get install docker-ce docker-ce-cli containerd.io

#centos alternative: pin an older release (20.10) instead of the latest
yum install -y yum-utils device-mapper-persistent-data lvm2
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
yum install -y docker-ce-20.10
systemctl enable docker && systemctl start docker 
cat << EOF > /etc/docker/daemon.json
{
  "registry-mirrors": ["https://h3blxdss.mirror.aliyuncs.com"],
  "exec-opts": ["native.cgroupdriver=systemd"],
  "insecure-registries": ["registry-tgq.harbor.com"]
}
EOF

systemctl daemon-reload && systemctl restart docker
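
Quick check that Docker picked up the systemd cgroup driver and the registry mirror:
docker info | grep -i -E "cgroup driver|registry"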

install kubeadm

cat << EOF > /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF

yum install -y --nogpgcheck kubelet-1.23.6 kubeadm-1.23.6 kubectl-1.23.6
yum install --nogpgcheck  --downloadonly --downloaddir=/opt/kubeadm kubelet-1.23.6 kubeadm-1.23.6 kubectl-1.23.6
#yum install -y kubelet kubeadm kubectl
systemctl enable kubelet && systemctl start kubelet
kubeadm config images list
kubeadm config images pull --image-repository=registry.aliyuncs.com/google_containers --kubernetes-version=v1.23.6
kubeadm config images list --image-repository=registry.aliyuncs.com/google_containers
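
For the offline installs referenced at the end of this document, the pulled images can be bundled with docker save; the /opt/kubeadm path below is just an example:
kubeadm config images list --image-repository=registry.aliyuncs.com/google_containers --kubernetes-version=v1.23.6 | xargs docker save -o /opt/kubeadm/k8s-images-v1.23.6.tar
#on the offline host: docker load -i /opt/kubeadm/k8s-images-v1.23.6.tar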

init master

kubeadm init --image-repository=registry.aliyuncs.com/google_containers  --kubernetes-version=v1.23.6 --service-cidr=10.1.0.0/16 --pod-network-cidr=10.244.0.0/16
#--apiserver-cert-extra-sans <public IP or domain name>
# To start using the cluster, run the following as a regular user
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config

# Alternatively, if you are the root user, you can run the following
export KUBECONFIG=/etc/kubernetes/admin.conf
#deploy flannel
kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
# NOTE: the flannel IP pool must match --pod-network-cidr; see the sketch below
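
A sketch for patching the manifest when your pod CIDR differs from flannel's 10.244.0.0/16 default (POD_CIDR and the local filename are assumptions):
POD_CIDR=10.244.0.0/16
curl -sLo kube-flannel.yml https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
sed -i "s#\"Network\": \"10.244.0.0/16\"#\"Network\": \"${POD_CIDR}\"#" kube-flannel.yml
kubectl apply -f kube-flannel.yml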


# deploy calico
# In calico.yaml, change the pod network to the CIDR passed to kubeadm init via --pod-network-cidr:
- name: CALICO_IPV4POOL_CIDR
  value: "10.244.0.0/16"
# Disable file logging so `kubectl logs` works.
- name: CALICO_DISABLE_FILE_LOGGING
  value: "true"

# deploy the tigera operator
kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/tigera-operator.yaml

#deploy the custom resources
kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml
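
As with flannel, patch the pool CIDR in custom-resources.yaml (192.168.0.0/16 by default, see the manifest below) to match --pod-network-cidr before creating it; a sketch:
curl -sLo custom-resources.yaml https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml
sed -i "s#cidr: 192.168.0.0/16#cidr: 10.244.0.0/16#" custom-resources.yaml
kubectl create -f custom-resources.yaml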

flannel yaml

---
kind: Namespace
apiVersion: v1
metadata:
  name: kube-flannel
  labels:
    k8s-app: flannel
    pod-security.kubernetes.io/enforce: privileged
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    k8s-app: flannel
  name: flannel
rules:
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
- apiGroups:
  - networking.k8s.io
  resources:
  - clustercidrs
  verbs:
  - list
  - watch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    k8s-app: flannel
  name: flannel
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: flannel
subjects:
- kind: ServiceAccount
  name: flannel
  namespace: kube-flannel
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-app: flannel
  name: flannel
  namespace: kube-flannel
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: kube-flannel-cfg
  namespace: kube-flannel
  labels:
    tier: node
    k8s-app: flannel
    app: flannel
data:
  cni-conf.json: |
    {
      "name": "cbr0",
      "cniVersion": "0.3.1",
      "plugins": [
        {
          "type": "flannel",
          "delegate": {
            "hairpinMode": true,
            "isDefaultGateway": true
          }
        },
        {
          "type": "portmap",
          "capabilities": {
            "portMappings": true
          }
        }
      ]
    }
  net-conf.json: |
    {
      "Network": "10.244.0.0/16",
      "Backend": {
        "Type": "vxlan"
      }
    }
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-flannel-ds
  namespace: kube-flannel
  labels:
    tier: node
    app: flannel
    k8s-app: flannel
spec:
  selector:
    matchLabels:
      app: flannel
  template:
    metadata:
      labels:
        tier: node
        app: flannel
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/os
                operator: In
                values:
                - linux
      hostNetwork: true
      priorityClassName: system-node-critical
      tolerations:
      - operator: Exists
        effect: NoSchedule
      serviceAccountName: flannel
      initContainers:
      - name: install-cni-plugin
        image: docker.io/flannel/flannel-cni-plugin:v1.1.2
        #image: docker.io/rancher/mirrored-flannelcni-flannel-cni-plugin:v1.1.2
        command:
        - cp
        args:
        - -f
        - /flannel
        - /opt/cni/bin/flannel
        volumeMounts:
        - name: cni-plugin
          mountPath: /opt/cni/bin
      - name: install-cni
        image: docker.io/flannel/flannel:v0.22.0
        #image: docker.io/rancher/mirrored-flannelcni-flannel:v0.22.0
        command:
        - cp
        args:
        - -f
        - /etc/kube-flannel/cni-conf.json
        - /etc/cni/net.d/10-flannel.conflist
        volumeMounts:
        - name: cni
          mountPath: /etc/cni/net.d
        - name: flannel-cfg
          mountPath: /etc/kube-flannel/
      containers:
      - name: kube-flannel
        image: docker.io/flannel/flannel:v0.22.0
        #image: docker.io/rancher/mirrored-flannelcni-flannel:v0.22.0
        command:
        - /opt/bin/flanneld
        args:
        - --ip-masq
        - --kube-subnet-mgr
        resources:
          requests:
            cpu: "100m"
            memory: "50Mi"
        securityContext:
          privileged: false
          capabilities:
            add: ["NET_ADMIN", "NET_RAW"]
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: EVENT_QUEUE_DEPTH
          value: "5000"
        volumeMounts:
        - name: run
          mountPath: /run/flannel
        - name: flannel-cfg
          mountPath: /etc/kube-flannel/
        - name: xtables-lock
          mountPath: /run/xtables.lock
      volumes:
      - name: run
        hostPath:
          path: /run/flannel
      - name: cni-plugin
        hostPath:
          path: /opt/cni/bin
      - name: cni
        hostPath:
          path: /etc/cni/net.d
      - name: flannel-cfg
        configMap:
          name: kube-flannel-cfg
      - name: xtables-lock
        hostPath:
          path: /run/xtables.lock
          type: FileOrCreate

calico yaml

# This section includes base Calico installation configuration.
# For more information, see: https://projectcalico.docs.tigera.io/master/reference/installation/api#operator.tigera.io/v1.Installation
apiVersion: operator.tigera.io/v1
kind: Installation
metadata:
  name: default
spec:
  # Configures Calico networking.
  calicoNetwork:
    # Note: The ipPools section cannot be modified post-install.
    ipPools:
    - blockSize: 26
      cidr: 192.168.0.0/16    # change to match --pod-network-cidr (e.g. 10.244.0.0/16)
      encapsulation: VXLANCrossSubnet
      natOutgoing: Enabled
      nodeSelector: all()

---

# This section configures the Calico API server.
# For more information, see: https://projectcalico.docs.tigera.io/master/reference/installation/api#operator.tigera.io/v1.APIServer
apiVersion: operator.tigera.io/v1
kind: APIServer
metadata:
  name: default
spec: {}

add node

#get token and ca
kubeadm token list | awk -F" " '{print $1}' |tail -n 1
openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^ .* //'
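
On recent kubeadm versions a ready-made join line can also be printed in one step:
kubeadm token create --print-join-command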


kubeadm join 192.168.31.127:6443 --token <token> --discovery-token-ca-cert-hash sha256:<ca-hash>

GPU

install nvidia-docker2

#Set up the repository and the GPG key:
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
   && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
sudo yum clean expire-cache
sudo yum install -y nvidia-docker2
sudo systemctl restart docker
sudo docker run --rm --gpus all nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi
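
For Kubernetes GPU workloads (e.g. the NVIDIA device plugin), nvidia is typically made the default Docker runtime. A sketch of the daemon.json change; re-add the registry mirror settings from earlier if you use them:
cat << EOF > /etc/docker/daemon.json
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
EOF
systemctl daemon-reload && systemctl restart docker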

install nvidia-driver for CentOS 7

#update the system first
sudo yum clean all
sudo yum update

#install development packages matching the running kernel version
sudo yum install -y gcc gcc-c++ kernel-devel-$(uname -r) kernel-headers-$(uname -r)

#check the gcc version
gcc --version

#CUDA 11.3 requires GCC 6 or newer; the commands below install GCC 7
sudo yum install centos-release-scl
sudo yum install devtoolset-7
# launch a new shell instance using the Software Collection scl tool:
scl enable devtoolset-7 bash
gcc --version

#check the current driver situation
sudo yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm
sudo yum install nvidia-detect      # install nvidia-detect
nvidia-detect -v                    # detect which driver version can be installed
cat /proc/driver/nvidia/version     # show the current driver version

#uninstall any previous driver; skip this step on a first install
sudo /usr/bin/nvidia-uninstall

#blacklist the nouveau driver (and remove nvidiafb from the blacklist if it is listed)
sudo rm -rf  disable-nouveau.conf
cat << EOF > disable-nouveau.conf
blacklist nouveau
options nouveau modeset=0

EOF
   
sudo chown root:root disable-nouveau.conf
sudo chmod 644 disable-nouveau.conf
sudo mv disable-nouveau.conf /etc/modprobe.d/
   
cat /etc/modprobe.d/disable-nouveau.conf
ll /etc/modprobe.d/disable-nouveau.conf


#rebuild the initramfs image so the nouveau blacklist takes effect
sudo dracut --force
sudo systemctl set-default multi-user.target    #set the run level to text mode
sudo shutdown -r now
             

#download the driver package NVIDIA-Linux-x86_64-525.60.13.run (https://www.nvidia.com/Download/index.aspx?lang=en-us)
#a mainline kernel needs the matching kernel-ml-devel to build the driver module
sudo rpm -ivh http://elrepo.org/linux/kernel/el7/x86_64/RPMS/kernel-ml-devel-6.3.2-1.el7.elrepo.x86_64.rpm

lsmod | grep nouveau                            #verify nouveau is disabled; this should print nothing
nvidia_run=NVIDIA-Linux-x86_64-460.84.run
chmod 755 $nvidia_run
sudo ./$nvidia_run
sudo systemctl set-default graphical.target     #set the run level back to graphical mode
sudo systemctl get-default
sudo shutdown -r now

#verify the installation succeeded
cat /proc/driver/nvidia/version    
nvidia-smi

disk partitioning and mounting

parted

$ parted /dev/sdb mklabel gpt

$ parted /dev/sdb mkpart primary xfs 0% 100%

$ mkfs.xfs /dev/sdb1

$ mount /dev/sdb1 /data

$ df -hT /data

Filesystem     Type  Size  Used Avail Use% Mounted on

/dev/sdb1      xfs   100G   33M  100G    1% /data

$ vim /etc/fstab

/dev/sdb1   /data   xfs   defaults   0 0

# enable automatic mounting at boot

fdisk

fdisk /dev/sdb
n    # create a new partition
p    # primary partition
     # accept the default partition size
w    # write the table and exit
mke2fs -t ext4 /dev/sdb1
mount /dev/sdb1 ~/newpath
# edit /etc/fstab and add the line below
UUID=c61117ca-9176-4d0b-be4d-1b0f434359a7  /newpath  ext4  defaults  0  0
# the UUID can be obtained with: blkid /dev/sdb1
# finally run: mount -a

migrating nodes between clusters

kubeadm reset
systemctl stop kubelet
systemctl stop docker
rm -rf /var/lib/cni/
rm -rf /var/lib/kubelet/*
rm -rf /etc/cni/
rm -rf /var/lib/etcd/*
ifconfig cni0 down
ifconfig flannel.1 down
ifconfig docker0 down
ip link set cni0 down && ip link set flannel.1 down 
ip link delete cni0 && ip link delete flannel.1
systemctl restart docker && systemctl restart kubelet


rm -rf /root/.kube/config
# then run the join command to join the other cluster

rejoin a master to the cluster

#Sometimes a master node must be deleted and re-added, for example after renaming the machine. When rejoining, etcd often reports an error:
[check-etcd] Checking that the etcd cluster is healthy error execution phase check-etcd: etcd cluster is not healthy: failed to dial endpoint https://ip:2379 with maintenance client: context deadline exceeded

#In that case, exec into the etcd pod on a master that is still running and delete the old master's etcd member
kubectl drain master01
kubectl delete node master01

#run on master01
kubeadm reset
rm -rf /etc/kubernetes/manifests/

kubectl -n kube-system exec -it etcd-master02 -- sh
etcdctl --endpoints 127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key member list

#find the member ID of the old master and remove it
etcdctl --endpoints 127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key member remove 12637f5ec2bd02b8



kubeadm init phase upload-certs --upload-certs  # prints the certificate key


#run the join commands (sample kubeadm output follows)
You can now join any number of the control-plane node running the following command on each as root:

  kubeadm join 172.16.101.211:9443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:ae579faf241a307a860d0a9e9ba1e308fe0e7a6006b90ece97eb42dfe9fc59b8 \
        --control-plane --certificate-key 79d731d185a93121e73899c10445f5fcaeac8d33155f5402c48bed5543f59e3b

Please note that the certificate-key gives access to cluster sensitive data, keep it secret!
As a safeguard, uploaded-certs will be deleted in two hours; If necessary, you can use
"kubeadm init phase upload-certs --upload-certs" to reload certs afterward.

Then you can join any number of worker nodes by running the following on each as root:

kubeadm join 172.16.101.211:9443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:ae579faf241a307a860d0a9e9ba1e308fe0e7a6006b90ece97eb42dfe9fc59b8


  kubeadm join 172.16.101.211:9443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:ae579faf241a307a860d0a9e9ba1e308fe0e7a6006b90ece97eb42dfe9fc59b8 \
        --control-plane --certificate-key 28d8cd85b5b90fe7603c599915521c20dc5ab5e6be28b52d904b44efb91eed19

offline installation tutorial

https://cloud.tencent.com/developer/article/2165251

offline NFS installation

https://blog.csdn.net/u013014761/article/details/100054241
https://qizhanming.com/blog/2018/08/08/how-to-install-nfs-on-centos-7
https://cloud.tencent.com/developer/article/2254970

keepalived

https://www.cnblogs.com/rexcheny/p/10778567.html

HA

https://www.cnblogs.com/wubolive/p/17140058.html#_label0_0
https://ost.51cto.com/posts/13131
https://www.linuxtechi.com/setup-highly-available-kubernetes-cluster-kubeadm/
https://developer.aliyun.com/article/1136864
https://hevodata.com/learn/kubernetes-high-availability/

Tencent articles

https://tencentcloudcontainerteam.github.io/2019/08/12/troubleshooting-with-kubernetes-network/
https://tencentcloudcontainerteam.github.io/2019/12/15/no-route-to-host/

kubeadm offline package download

yum install --downloadonly --downloaddir=/home/centos/k8s kubelet-1.23.6 kubeadm-1.23.6 kubectl-1.23.6

external IP incompatible with ipvs
https://blog.csdn.net/qq_41586875/article/details/124330823

regenerate admin.conf

kubeadm init phase kubeconfig admin 

renew certificates

#check certificate expiry
kubeadm certs check-expiration

#on older k8s versions
kubeadm alpha certs check-expiration

#back up the existing config first
cp -r /etc/kubernetes /etc/kubernetes.old

#renew the certificates on all three master nodes
kubeadm certs renew all
#on older versions
kubeadm alpha certs renew all

#restart the control-plane containers on each master
docker ps | egrep "k8s_kube-apiserver|k8s_kube-scheduler|k8s_kube-controller" | awk '{print $1}' | xargs docker restart
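
A quick verification that the renewal took effect (openssl reads the API server's serving certificate directly):
kubeadm certs check-expiration
openssl x509 -enddate -noout -in /etc/kubernetes/pki/apiserver.crt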

#update .kube/config
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config
#reference: https://kubernetes.io/zh-cn/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/

upgrade g++ on CentOS

#SCL (Software Collections) is a multi-version software solution for the CentOS/RHEL platforms; it provides a convenient, safe way to install and use multiple versions of applications and runtime environments side by side.

#Developer Toolset is a development tool collection for CentOS and RHEL developers, providing different versions of the GCC toolchain, the GNU Debugger, and other development, debugging, and profiling tools.

#So installing the SCL repository and the matching devtoolset version is a clean way to manage newer gcc, g++, gfortran, and related tools.

#install the SCL repository
yum -y install centos-release-scl

#install devtoolset-x
#install whichever devtoolset version you need; devtoolset-9 is used here as an example
yum -y install devtoolset-9

#the default install location is /opt
#activate the devtoolset

#edit ~/.bash_profile and append the following line at the end
source /opt/rh/devtoolset-9/enable

source ~/.bash_profile

#check that the installation succeeded
gcc -v