TiDB clusters are monitored with Prometheus and Grafana. When you create a TiDB cluster with TiDB Operator, you can also create and configure an independent monitoring system for each cluster; it runs in the same namespace as the TiDB cluster and consists of the Prometheus and Grafana components. This article is based on v6.1 and walks step by step through quickly building a monitoring system that covers multiple clusters.
1. Create monitoring and alerting
1.1 Create the PV directories
[root@k8s-node2 disks]# for i in `seq 9`; do mkdir -p /home/data/pv0$i; done
[root@k8s-node2 disks]# ll /home/data/pv0
pv01/  pv02/  pv03/  pv04/  pv05/  pv06/  pv07/  pv08/  pv09/
1.2 Create the local PVs in bulk
[root@k8s-master monitoring]# for i in `seq 9`; do
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: tidb-cluster-172-16-4-203-pv0${i}
spec:
  capacity:
    storage: 50Gi
  volumeMode: Filesystem
  accessModes:
  - ReadWriteOnce
  persistentVolumeReclaimPolicy: Delete
  storageClassName: local-storage-monitoring
  local:
    path: /home/data/pv0${i}
  nodeAffinity:
    required:
      nodeSelectorTerms:
      - matchExpressions:
        - key: kubernetes.io/hostname
          operator: In
          values:
          - k8s-node2
EOF
done
persistentvolume/tidb-cluster-172-16-4-203-pv01 created
persistentvolume/tidb-cluster-172-16-4-203-pv02 created
persistentvolume/tidb-cluster-172-16-4-203-pv03 created
persistentvolume/tidb-cluster-172-16-4-203-pv04 created
persistentvolume/tidb-cluster-172-16-4-203-pv05 created
persistentvolume/tidb-cluster-172-16-4-203-pv06 created
persistentvolume/tidb-cluster-172-16-4-203-pv07 created
persistentvolume/tidb-cluster-172-16-4-203-pv08 created
persistentvolume/tidb-cluster-172-16-4-203-pv09 created
1.3 Check the PV status
[root@k8s-master monitoring]# kubectl get pv
NAME                             CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM   STORAGECLASS               REASON   AGE
tidb-cluster-172-16-4-203-pv01   50Gi       RWO            Delete           Available           local-storage-monitoring            48s
tidb-cluster-172-16-4-203-pv02   50Gi       RWO            Delete           Available           local-storage-monitoring            48s
tidb-cluster-172-16-4-203-pv03   50Gi       RWO            Delete           Available           local-storage-monitoring            47s
tidb-cluster-172-16-4-203-pv04   50Gi       RWO            Delete           Available           local-storage-monitoring            47s
tidb-cluster-172-16-4-203-pv05   50Gi       RWO            Delete           Available           local-storage-monitoring            47s
tidb-cluster-172-16-4-203-pv06   50Gi       RWO            Delete           Available           local-storage-monitoring            46s
tidb-cluster-172-16-4-203-pv07   50Gi       RWO            Delete           Available           local-storage-monitoring            46s
tidb-cluster-172-16-4-203-pv08   50Gi       RWO            Delete           Available           local-storage-monitoring            46s
tidb-cluster-172-16-4-203-pv09   50Gi       RWO            Delete           Available           local-storage-monitoring            46s
1.4 Create the TidbMonitor YAML file
[root@k8s-master monitoring]# cat operator-monitoring.yaml
apiVersion: pingcap.com/v1alpha1
kind: TidbMonitor
metadata:
  name: basic
spec:
  clusters:
  - name: mycluster
  prometheus:
    baseImage: prom/prometheus
    version: v2.27.1
    logLevel: info
    reserveDays: 12
    service:
      type: NodePort
      portName: http-prometheus
  grafana:
    baseImage: grafana/grafana
    version: 7.5.11
    logLevel: info
    username: admin
    password: admin
    envs:
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_NAME: "Main Org."
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer"
    service:
      type: NodePort
      portName: http-grafana
  initializer:
    baseImage: pingcap/tidb-monitor-initializer
    version: v6.1.0
  reloader:
    baseImage: pingcap/tidb-monitor-reloader
    version: v1.0.1
    service:
      type: NodePort
      portName: tcp-reloader
  prometheusReloader:
    baseImage: quay.io/prometheus-operator/prometheus-config-reloader
    version: v0.49.0
  imagePullPolicy: IfNotPresent
  persistent: true
  storageClassName: local-storage-monitoring
  storage: 10Gi
  nodeSelector: {}
  annotations: {}
  tolerations: []
  kubePrometheusURL: http://prometheus-k8s.monitoring.svc:9090
  alertmanagerURL: ""
[root@k8s-master monitoring]# kubectl apply -f operator-monitoring.yaml -ntidb
tidbmonitor.pingcap.com/basic created
[root@k8s-master monitoring]# kubectl get pod -ntidb
NAME                                   READY   STATUS    RESTARTS   AGE
basic-monitor-0                        3/4     Running   0          9s
mycluster-discovery-58b658b88d-wnqgs   1/1     Running   1          133d
mycluster-pd-0                         1/1     Running   0          3h22m
mycluster-pd-1                         1/1     Running   0          64m
mycluster-pd-2                         1/1     Running   0          64m
mycluster-tidb-0                       2/2     Running   0          3h10m
mycluster-tidb-1                       2/2     Running   2          81m
mycluster-tidb-2                       2/2     Running   0          3h14m
mycluster-tikv-0                       1/1     Running   0          3h14m
mycluster-tikv-1                       1/1     Running   0          3h16m
mycluster-tikv-2                       1/1     Running   0          64m
mycluster-tikv-3                       1/1     Running   0          64m
mycluster-tikv-4                       1/1     Running   0          64m
mycluster-tikv-5                       1/1     Running   0          64m
1.5 Check the PVC status
[root@k8s-master monitoring]# kubectl get pvc -l app.kubernetes.io/instance=basic,app.kubernetes.io/component=monitor -n tidb
NAME                          STATUS   VOLUME                           CAPACITY   ACCESS MODES   STORAGECLASS               AGE
tidbmonitor-basic-monitor-0   Bound    tidb-cluster-172-16-4-203-pv07   50Gi       RWO            local-storage-monitoring   2m7s
1.6 Check the service status
[root@k8s-master monitoring]# kubectl get svc -ntidb
NAME                     TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                          AGE
basic-grafana            NodePort    10.100.15.69     <none>        3000:31886/TCP                   12m
basic-monitor-reloader   NodePort    10.104.226.145   <none>        9089:30305/TCP                   12m
basic-prometheus         NodePort    10.96.172.72     <none>        9090:30536/TCP                   12m
mycluster-discovery      ClusterIP   10.102.101.73    <none>        10261/TCP,10262/TCP              133d
mycluster-pd             ClusterIP   10.98.161.169    <none>        2379/TCP                         133d
mycluster-pd-peer        ClusterIP   None             <none>        2380/TCP,2379/TCP                133d
mycluster-tidb           NodePort    10.109.243.39    <none>        4000:30020/TCP,10080:30040/TCP   133d
mycluster-tidb-peer      ClusterIP   None             <none>        10080/TCP                        133d
mycluster-tikv-peer      ClusterIP   None             <none>        20160/TCP                        133d
1.7 Access the Grafana and Prometheus dashboards
Method 1: access via port-forward
[root@k8s-master monitoring]# kubectl port-forward -n tidb svc/basic-grafana 3000:3000 &>/tmp/portforward-grafana.log &
[1] 15143
[root@k8s-master monitoring]# kubectl port-forward -n tidb svc/basic-prometheus 9090:9090 &>/tmp/portforward-prometheus.log &
[2] 16015
Method 2: access via NodePort
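Since basic-grafana and basic-prometheus are NodePort services (31886 and 30536 in the svc listing above), they can be reached directly through any node's IP. A minimal sketch, assuming 172.16.4.203 is a reachable node IP; take the actual NodePort values from your own kubectl get svc output:

# Look up the NodePorts assigned to Grafana and Prometheus
kubectl get svc -n tidb basic-grafana basic-prometheus
# Reach the dashboards through a node IP and the NodePorts from the listing above
curl -I http://172.16.4.203:31886/login    # Grafana
curl -I http://172.16.4.203:30536/graph    # Prometheus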
2. Enable Ingress and expose access externally through Ingress
2.1 Install the Ingress controller
See the separate document (K8S Ingress controllers).
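For reference only, a minimal sketch of installing the ingress-nginx controller with Helm; this assumes Helm is available and that ingress-nginx is the controller you want, and is not a substitute for the document linked above:

helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo update
# Install the controller into its own namespace
helm install ingress-nginx ingress-nginx/ingress-nginx \
  --namespace ingress-nginx --create-namespace
kubectl get pod -n ingress-nginx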
2.2 Expose the Prometheus/Grafana services externally in TidbMonitor
[root@k8s-master tidb]# cat ingress-monitor.yaml
apiVersion: pingcap.com/v1alpha1
kind: TidbMonitor
metadata:
  name: ingress-monitor
  namespace: tidb
spec:
  clusters:
  - name: lqb
  persistent: true
  storageClassName: local-storage
  storage: 45G
  prometheus:
    baseImage: prom/prometheus
    version: v2.27.1
    # Expose Prometheus externally via Ingress
    ingress:
      hosts:
      - prometheus.mytest.org
  grafana:
    baseImage: grafana/grafana
    version: 7.5.11
    # Expose Grafana externally via Ingress
    ingress:
      hosts:
      - grafana.mytest.org
  initializer:
    baseImage: pingcap/tidb-monitor-initializer
    version: v6.1.0
  reloader:
    baseImage: pingcap/tidb-monitor-reloader
    version: v1.0.1
  prometheusReloader:
    baseImage: quay.io/prometheus-operator/prometheus-config-reloader
    version: v0.49.0
2.3 Apply the configuration
[root@k8s-master tidb]# kubectl apply -f ingress-monitor.yaml
tidbmonitor.pingcap.com/ingress-monitor created
[root@k8s-master tidb]# kubectl get svc,pod -ntidb
NAME                                       TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                          AGE
service/access-dashboard                   NodePort    10.98.67.190     <none>        10262:30836/TCP                  15d
service/ingress-monitor-grafana            ClusterIP   10.105.79.132    <none>        3000/TCP                         29s
service/ingress-monitor-monitor-reloader   ClusterIP   10.109.154.169   <none>        9089/TCP                         29s
service/ingress-monitor-prometheus         ClusterIP   10.105.80.91     <none>        9090/TCP                         29s
service/lqb-discovery                      ClusterIP   10.98.145.163    <none>        10261/TCP,10262/TCP              135m
service/lqb-pd                             ClusterIP   10.97.247.56     <none>        2379/TCP                         135m
service/lqb-pd-peer                        ClusterIP   None             <none>        2380/TCP,2379/TCP                135m
service/lqb-tidb                           NodePort    10.97.69.112     <none>        4000:30022/TCP,10080:30042/TCP   135m
service/lqb-tidb-peer                      ClusterIP   None             <none>        10080/TCP                        135m
service/lqb-tikv-peer                      ClusterIP   None             <none>        20160/TCP                        135m
service/monitor-grafana                    NodePort    10.108.176.207   <none>        3000:31805/TCP                   80m
service/monitor-monitor-reloader           ClusterIP   10.109.183.102   <none>        9089/TCP                         80m
service/monitor-prometheus                 NodePort    10.107.252.241   <none>        9090:30398/TCP                   80m
2.4 Add hosts entries and access externally (via ${node_ip}:${NodePort})
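A minimal sketch of the hosts entry and a connectivity check, assuming 172.16.4.203 is the IP of a node where the Ingress controller's HTTP NodePort is reachable; substitute your own node IP and the controller's NodePort for ${NodePort}:

# /etc/hosts on the client machine (example node IP; use a node running the Ingress controller)
172.16.4.203  grafana.mytest.org prometheus.mytest.org

# Access through the Ingress controller's HTTP NodePort
curl -I http://grafana.mytest.org:${NodePort}/login
curl -I http://prometheus.mytest.org:${NodePort}/graph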
3. Multi-cluster monitoring
3.1 Configure multi-cluster monitoring by modifying the YAML file
[root@k8s-master tidb]# cat monitor.yaml
apiVersion: pingcap.com/v1alpha1
kind: TidbMonitor
metadata:
  name: monitor
  namespace: tidb
spec:
  # Multi-cluster monitoring: the key fields are each cluster's name and namespace
  clusters:
  - name: lqb
    namespace: tidb
  - name: yz
    namespace: tidb
  clusterScoped: true   # false: manage clusters in specific namespaces; true: manage clusters in all namespaces
  persistent: true
  storageClassName: local-storage
  storage: 45G
  prometheus:
    baseImage: prom/prometheus
    version: v2.27.1
    service:
      type: NodePort
  grafana:
    baseImage: grafana/grafana
    version: 7.5.11
    service:
      type: NodePort
  initializer:
    baseImage: pingcap/tidb-monitor-initializer
    version: v6.1.0
  reloader:
    baseImage: pingcap/tidb-monitor-reloader
    version: v1.0.1
  prometheusReloader:
    baseImage: quay.io/prometheus-operator/prometheus-config-reloader
    version: v0.49.0
  imagePullPolicy: IfNotPresent
3.2 Apply the configuration
[root@k8s-master tidb]# kubectl apply -f monitor.yaml
tidbmonitor.pingcap.com/monitor configured
[root@k8s-master tidb]# kubectl describe tidbmonitor/monitor -ntidb
3.3 View the monitoring dashboards
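A quick way to confirm that both clusters are being scraped is to port-forward the Prometheus service of this monitor and inspect its targets; a minimal sketch, assuming the monitor-prometheus service name shown in the svc listing in section 2.3:

kubectl port-forward -n tidb svc/monitor-prometheus 9090:9090 &>/tmp/portforward-monitor.log &
# Targets from both clusters (lqb and yz) should appear in the target list
curl -s 'http://127.0.0.1:9090/api/v1/targets' | grep -c 'lqb'
curl -s 'http://127.0.0.1:9090/api/v1/targets' | grep -c 'yz'

The same check can be done in the Grafana dashboards, where the cluster selector should list both clusters.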
Summary
Deploying the monitoring system through TiDB Operator lets a single monitoring stack watch multiple clusters, and persisting the monitoring data keeps it safe across pod restarts. Doing this well requires a basic understanding of PVs and local volume provisioning.