1.四個配置文件
[root@kubemaster01 alertmanager]# ls -l
-rw-r--r-- 1 root root  676 Oct 28 15:43 alertmanager-configmap.yaml
-rw-r--r-- 1 root root 2183 Oct 28 15:36 alertmanager-deployment.yaml
-rw-r--r-- 1 root root  331 Oct 28 15:36 alertmanager-pvc.yaml
-rw-r--r-- 1 root root  372 Oct 28 15:36 alertmanager-service.yaml
2.修改 pvc 以及 configmap 中的地址
[root@kubemaster01 alertmanager]# cat alertmanager-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: alertmanager
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
spec:
  storageClassName: managed-nfs-storage
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "2Gi"
[root@kubemaster01 alertmanager]# cat alertmanager-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: '[email protected]'
      smtp_auth_username: '[email protected]'
      smtp_auth_password: '123456'
      smtp_require_tls: false
    receivers:
    - name: default-receiver
      email_configs:
      - to: "[email protected]"
    route:
      group_interval: 1m
      group_wait: 10s
      receiver: default-receiver
      repeat_interval: 1m
[root@kubemaster01 alertmanager]#
3.部署
kubectl apply -f alertmanager-configmap.yaml
kubectl apply -f alertmanager-pvc.yaml
kubectl apply -f alertmanager-deployment.yaml
kubectl apply -f alertmanager-service.yaml
4.Prometheus和alertmanager 通訊配置
修改 prometheus 的 configmap 配置，然後重新應用
5.查看是否生效
6.修改 configmap，配置 prometheus 的報警規則
(kubectl apply -f prometheus-configmap.yaml)
創建configmap
kubectl apply -f prometheus-rules.yaml
[root@kubemaster01 prometheus]# cat prometheus-rules.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: kube-system
data:
  general.rules: |
    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }} job {{ $labels.job }} 已經停止5分鐘以上."
  node.rules: |
    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分區使用率過高"
          description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分區使用大於80% (當前值: {{ $value }})"
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} 內存使用率過高"
          description: "{{ $labels.instance }}內存使用大於80% (當前值: {{ $value }})"
      - alert: NodeCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU使用率過高"
          description: "{{ $labels.instance }}CPU使用大於60% (當前值: {{ $value }})"
[root@kubemaster01 prometheus]#
prometheus服務掛載configmap