1. Installing a Prometheus environment via the Operator and via binaries
1.1 Operator deployment
1.1.1 Download the project files
# wget https://codeload.github.com/prometheus-operator/kube-prometheus/zip/refs/heads/release-0.12 -O kube-prometheus-release-0.12.zip
# unzip kube-prometheus-release-0.12.zip
# cd kube-prometheus-release-0.12/manifests/
1.1.2 List the images required by the yaml files
# grep -R 'image: ' ./*
./alertmanager-alertmanager.yaml: image: quay.io/prometheus/alertmanager:v0.25.0
./blackboxExporter-deployment.yaml: image: quay.io/prometheus/blackbox-exporter:v0.23.0
./blackboxExporter-deployment.yaml: image: jimmidyson/configmap-reload:v0.5.0
./blackboxExporter-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./grafana-deployment.yaml: image: grafana/grafana:9.3.16
./kubeStateMetrics-deployment.yaml: image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.7.0
./kubeStateMetrics-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./kubeStateMetrics-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./nodeExporter-daemonset.yaml: image: quay.io/prometheus/node-exporter:v1.5.0
./nodeExporter-daemonset.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./prometheusAdapter-deployment.yaml: image: registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.10.0
./prometheusOperator-deployment.yaml: image: quay.io/prometheus-operator/prometheus-operator:v0.62.0
./prometheusOperator-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./prometheus-prometheus.yaml: image: quay.io/prometheus/prometheus:v2.41.0
1.1.3 Pre-pull the images that cannot be downloaded directly and push them to the local registry
# nerdctl pull bitnami/kube-state-metrics:2.7.0
# nerdctl tag bitnami/kube-state-metrics:2.7.0 harbor.zhao.net/baseimages/kube-state-metrics:2.7.0
# nerdctl push harbor.zhao.net/baseimages/kube-state-metrics:2.7.0
# nerdctl pull v5cn/prometheus-adapter:v0.10.0
# nerdctl tag v5cn/prometheus-adapter:v0.10.0 harbor.zhao.net/baseimages/prometheus-adapter:v0.10.0
# nerdctl push harbor.zhao.net/baseimages/prometheus-adapter:v0.10.0
1.1.4 Update the image names in the yaml files
# sed -i 's@registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.7.0@harbor.zhao.net/baseimages/kube-state-metrics:2.7.0@g' kubeStateMetrics-deployment.yaml
# sed -i 's@registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.10.0@harbor.zhao.net/baseimages/prometheus-adapter:v0.10.0@g' prometheusAdapter-deployment.yaml
1.1.5 Create the resources
# create the CRDs and the monitoring namespace first
# kubectl create -f setup/
# then create the workloads and services
# kubectl apply -f ./
Check pod status
# kubectl get pod -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-main-0 2/2 Running 0 29s
alertmanager-main-1 2/2 Running 0 29s
alertmanager-main-2 2/2 Running 0 29s
blackbox-exporter-6fd586b445-hsx6r 3/3 Running 0 65s
grafana-6849bbf859-jlx5z 1/1 Running 0 56s
kube-state-metrics-68c68774f-bcq8b 3/3 Running 0 53s
node-exporter-4cr2w 2/2 Running 0 50s
node-exporter-b8qtk 2/2 Running 0 50s
node-exporter-f6rl2 2/2 Running 0 50s
node-exporter-sbtn9 2/2 Running 0 50s
node-exporter-vlfzj 2/2 Running 0 50s
node-exporter-zknth 2/2 Running 0 50s
prometheus-adapter-5bcc998b6d-7mw6w 1/1 Running 0 40s
prometheus-adapter-5bcc998b6d-9cstl 1/1 Running 0 40s
prometheus-k8s-0 2/2 Running 0 23s
prometheus-k8s-1 2/2 Running 0 22s
prometheus-operator-776c6c6b87-7b7m2 2/2 Running 0 38s
Check the services
# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager-main ClusterIP 10.100.82.164 <none> 9093/TCP,8080/TCP 6m17s
alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 5m39s
blackbox-exporter ClusterIP 10.100.115.85 <none> 9115/TCP,19115/TCP 6m15s
grafana ClusterIP 10.100.52.29 <none> 3000/TCP 6m7s
kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 6m4s
node-exporter ClusterIP None <none> 9100/TCP 6m1s
prometheus-adapter ClusterIP 10.100.22.207 <none> 443/TCP 5m51s
prometheus-k8s ClusterIP 10.100.231.69 <none> 9090/TCP,8080/TCP 5m54s
prometheus-operated ClusterIP None <none> 9090/TCP 5m33s
prometheus-operator ClusterIP None <none> 8443/TCP 5m48s
The manifests install NetworkPolicies by default. They can be deleted first while debugging, and re-created later according to actual requirements.
# for i in `ls |grep network`;do kubectl delete -f $i;done
networkpolicy.networking.k8s.io "alertmanager-main" deleted
networkpolicy.networking.k8s.io "blackbox-exporter" deleted
networkpolicy.networking.k8s.io "grafana" deleted
networkpolicy.networking.k8s.io "kube-state-metrics" deleted
networkpolicy.networking.k8s.io "node-exporter" deleted
networkpolicy.networking.k8s.io "prometheus-adapter" deleted
networkpolicy.networking.k8s.io "prometheus-k8s" deleted
networkpolicy.networking.k8s.io "prometheus-operator" deleted
root@k8s-master2:~/kube-prometheus-release-0.12/manifests# vim prometheus-service.yaml
root@k8s-master2:~/kube-prometheus-release-0.12/manifests# kubectl apply -f prometheus-service.yaml
service/prometheus-k8s configured
1.1.6 Verify the Prometheus web UI
Change the service type in prometheus-service.yaml to NodePort
# vim prometheus-service.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.41.0
  name: prometheus-k8s
  namespace: monitoring
spec:
  type: NodePort # add NodePort type
  ports:
  - name: web
    port: 9090
    targetPort: web
    nodePort: 39090 # set the node port
  - name: reloader-web
    port: 8080
    targetPort: reloader-web
    nodePort: 38080 # set the node port
  selector:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
  sessionAffinity: ClientIP
# kubectl apply -f prometheus-service.yaml
# kubectl get svc -n monitoring |grep prometheus
prometheus-adapter ClusterIP 10.100.22.207 <none> 443/TCP 19m
prometheus-k8s NodePort 10.100.231.69 <none> 9090:39090/TCP,8080:38080/TCP 19m
prometheus-operated ClusterIP None <none> 9090/TCP 19m
prometheus-operator ClusterIP None <none> 8443/TCP 19m
Check the Status pages in the web UI
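Besides the browser, the active targets can be checked from the command line through the Prometheus HTTP API (a quick sketch; 172.20.20.101 stands in for any node IP in the cluster):
# curl -s 'http://172.20.20.101:39090/api/v1/targets?state=active' | grep -o '"health":"up"' | wc -l
A non-zero count confirms the NodePort service is reachable and targets are being scraped.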
1.1.7 Verify the Grafana web UI
Change the service type in grafana-service.yaml to NodePort
# vim grafana-service.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: grafana
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 9.3.16
  name: grafana
  namespace: monitoring
spec:
  type: NodePort # add NodePort type
  ports:
  - name: http
    port: 3000
    targetPort: http
    nodePort: 33000 # set the node port
  selector:
    app.kubernetes.io/component: grafana
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: kube-prometheus
# kubectl apply -f grafana-service.yaml
# kubectl get svc -n monitoring|grep 33000
grafana NodePort 10.100.52.29 <none> 3000:33000/TCP 5h2m
Open it in a browser; the default username/password is admin:admin
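Grafana also exposes a health endpoint, handy for confirming the NodePort works before logging in (a sketch; the node IP is an example):
# curl -s http://172.20.20.101:33000/api/health
A healthy instance reports "database": "ok" in the returned JSON.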
1.2 Binary deployment
1.2.1 Download the prometheus server binary
# mkdir /apps
# cd /apps
# wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
# tar -xvf prometheus-2.45.0.linux-amd64.tar.gz
# ln -s /apps/prometheus-2.45.0.linux-amd64 /apps/prometheus
1.2.2 Start the prometheus service
- Create the systemd service file
# vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
# --web.enable-lifecycle enables dynamic config reloading; the configuration can then be reloaded with: curl -X POST http://localhost:9090/-/reload
ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
[Install]
WantedBy=multi-user.target
- Start the service
# systemctl daemon-reload
# systemctl start prometheus.service
# systemctl enable prometheus.service
# ss -ntpl |grep 9090
LISTEN 0 4096 *:9090 *:* users:(("prometheus",pid=55786,fd=7))
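Because --web.enable-lifecycle is enabled, later configuration changes can be validated and applied without restarting the service; promtool ships in the same tarball (a sketch):
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
# curl -X POST http://localhost:9090/-/reload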
1.2.3 Verify the prometheus web UI
2. Collecting metrics with node-exporter and cadvisor
2.1 node-exporter
Install node-exporter on each k8s node (as a binary or as a daemonset) to collect host-level monitoring metrics from the k8s nodes; it listens on port 9100 by default.
2.1.1 Deploy node-exporter as a DaemonSet
If the k8s environment already runs a prometheus node-exporter, stop that container first or change the conflicting port.
2.1.1.1 Write the yaml file
# vim case2-daemonset-deploy-node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    k8s-app: node-exporter
spec:
  selector:
    matchLabels:
      k8s-app: node-exporter
  template:
    metadata:
      labels:
        k8s-app: node-exporter
    spec:
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      containers:
      - image: prom/node-exporter:v1.3.1
        imagePullPolicy: IfNotPresent
        name: prometheus-node-exporter
        ports:
        - containerPort: 9100
          hostPort: 9100
          protocol: TCP
          name: metrics
        volumeMounts:
        - mountPath: /host/proc
          name: proc
        - mountPath: /host/sys
          name: sys
        - mountPath: /host
          name: rootfs
        args:
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
      hostNetwork: true
      hostPID: true
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"
  labels:
    k8s-app: node-exporter
  name: node-exporter
  namespace: monitoring
spec:
  type: NodePort
  ports:
  - name: http
    port: 9100
    nodePort: 39100
    protocol: TCP
  selector:
    k8s-app: node-exporter
2.1.1.2 Create the resources
# kubectl create ns monitoring
# kubectl apply -f case2-daemonset-deploy-node-exporter.yaml
# check the pods
# kubectl get pod -n monitoring -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
node-exporter-2n922 1/1 Running 0 8m54s 172.20.20.113 172.20.20.113 <none> <none>
node-exporter-9sbtt 1/1 Running 0 6m42s 172.20.20.102 master-02 <none> <none>
node-exporter-pj8zw 1/1 Running 0 8m54s 172.20.20.112 worker-02 <none> <none>
node-exporter-vd9qx 1/1 Running 0 8m54s 172.20.20.101 master-01 <none> <none>
node-exporter-vfvkq 1/1 Running 0 8m54s 172.20.20.103 172.20.20.103 <none> <none>
node-exporter-wldvl 1/1 Running 0 8m54s 172.20.20.111 worker-01 <none> <none>
# check the service
# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
node-exporter NodePort 10.100.107.25 <none> 9100:39100/TCP 3m45s
2.1.1.3 Check the metrics page
Visit hostIP:9100/metrics in a browser.
Common metrics:
node_boot_time: node boot time (timestamp)
node_cpu: system CPU usage
node_disk*: disk I/O
node_filesystem*: filesystem usage
node_load1: 1-minute system load average
node_memory*: memory usage
node_network*: network bandwidth metrics
go_*: Go runtime metrics of the node exporter
process_*: the node exporter's own process metrics
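A single metric can also be spot-checked directly against the exporter endpoint with curl (a sketch; the node IP is an example):
# curl -s http://172.20.20.111:9100/metrics | grep '^node_load1'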
2.1.2 prometheus server collects node-exporter data
2.1.2.1 Edit the Prometheus configuration file
# vim /apps/prometheus/prometheus.yml
# global configuration
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# alerting configuration
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
# rule configuration
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# scrape configuration
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "node"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["172.20.20.202:9100","172.20.20.203:9100"]
  # added: the k8s cluster nodes
  - job_name: "k8s-node"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["172.20.20.101:9100","172.20.20.102:9100","172.20.20.103:9100","172.20.20.111:9100","172.20.20.112:9100","172.20.20.113:9100"]
# restart the service
# systemctl restart prometheus.service
2.1.2.2 Check the Prometheus web status
The k8s nodes added above are now reporting data to Prometheus.
2.1.2.3 Verify node data
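The same verification can be done from the Prometheus host with an instant query against the HTTP API (a sketch using the job name defined above):
# curl -s http://localhost:9090/api/v1/query --data-urlencode 'query=node_load1{job="k8s-node"}'
One result per node (six here) indicates the job is healthy.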
2.2 cadvisor
cAdvisor (container advisor) not only collects information about all running containers on a machine, it also provides a basic query UI and an HTTP interface, making it easy for components such as prometheus to scrape the data. cAdvisor monitors the containers on a node in real time and collects performance data, including CPU usage, memory usage, network throughput and filesystem usage.
2.2.1 Deploy cadvisor as a DaemonSet
2.2.1.1 Prepare the image
# nerdctl pull registry.cn-hangzhou.aliyuncs.com/zhangshijie/cadvisor-amd64:v0.39.3
# nerdctl tag registry.cn-hangzhou.aliyuncs.com/zhangshijie/cadvisor-amd64:v0.39.3 harbor.zhao.net/baseimages/cadvisor-amd64:v0.39.3
# nerdctl push harbor.zhao.net/baseimages/cadvisor-amd64:v0.39.3
2.2.1.2 Prepare the yaml file and create the resources
# vim case1-daemonset-deploy-cadvisor.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: cadvisor
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: cAdvisor
  template:
    metadata:
      labels:
        app: cAdvisor
    spec:
      tolerations: # tolerate the master NoSchedule taint
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      hostNetwork: true
      restartPolicy: Always # restart policy
      containers:
      - name: cadvisor
        image: harbor.zhao.net/baseimages/cadvisor-amd64:v0.39.3
        imagePullPolicy: IfNotPresent # image pull policy
        ports:
        - containerPort: 8080
        volumeMounts:
        - name: root
          mountPath: /rootfs
        - name: run
          mountPath: /var/run
        - name: sys
          mountPath: /sys
        - name: docker
          mountPath: /var/lib/containerd
      volumes:
      - name: root
        hostPath:
          path: /
      - name: run
        hostPath:
          path: /var/run
      - name: sys
        hostPath:
          path: /sys
      - name: docker
        hostPath:
          path: /var/lib/containerd
# kubectl apply -f case1-daemonset-deploy-cadvisor.yaml
# check status
# kubectl get pod -n monitoring -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
cadvisor-66hh6 1/1 Running 0 4m19s 172.20.20.113 172.20.20.113 <none> <none>
cadvisor-7bp45 1/1 Running 0 4m19s 172.20.20.103 172.20.20.103 <none> <none>
cadvisor-9gbtt 1/1 Running 0 4m19s 172.20.20.102 master-02 <none> <none>
cadvisor-bxl6p 1/1 Running 0 4m19s 172.20.20.112 worker-02 <none> <none>
cadvisor-ql2qz 1/1 Running 0 4m19s 172.20.20.101 master-01 <none> <none>
cadvisor-rxv8m 1/1 Running 0 4m19s 172.20.20.111 worker-01 <none> <none>
2.2.1.3 Check the web pages
- Visit hostIP:8080 in a browser to view the web UI
- Visit hostIP:8080/metrics to view the raw metrics
2.2.2 prometheus server collects cadvisor data
2.2.2.1 cadvisor metrics
Once cAdvisor samples are being collected normally, the following expressions can be used.
Container CPU usage:
sum(irate(container_cpu_usage_seconds_total{image!=""}[1m])) without (cpu)
Container memory usage (bytes):
container_memory_usage_bytes{image!=""}
Container network receive rate (bytes/second):
sum(rate(container_network_receive_bytes_total{image!=""}[1m])) without (interface)
Container network transmit rate (bytes/second):
sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) without (interface)
Container filesystem read rate (bytes/second):
sum(rate(container_fs_reads_bytes_total{image!=""}[1m])) without (device)
Container filesystem write rate (bytes/second):
sum(rate(container_fs_writes_bytes_total{image!=""}[1m])) without (device)
Common cadvisor container monitoring metrics
- Network traffic
Bytes received by each container per second (over 1m), filtered by name=~".+":
sum(rate(container_network_receive_bytes_total{name=~".+"}[1m])) by (name)
Bytes transmitted by each container per second (over 1m), filtered by name=~".+":
sum(rate(container_network_transmit_bytes_total{name=~".+"}[1m])) by (name)
- Container CPU
Cumulative system CPU time of all containers (over 1m):
sum(rate(container_cpu_system_seconds_total[1m]))
System CPU time of each container (over 1m):
sum(irate(container_cpu_system_seconds_total{image!=""}[1m])) without (cpu)
CPU usage percentage of each container:
sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name)*100
Total CPU usage percentage of all containers:
sum(sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name)*100)
2.2.2.2 Add the scrape job
# edit the configuration file
# vim /apps/prometheus/prometheus.yml
...
  - job_name: "k8s-cadvisor"
    static_configs:
      - targets: ["172.20.20.101:8080","172.20.20.102:8080","172.20.20.103:8080","172.20.20.111:8080","172.20.20.112:8080","172.20.20.113:8080"]
# restart the service
# systemctl restart prometheus.service
2.2.2.3 Verify data collection
2.2.2.4 Verify cadvisor data
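As with the node data, a quick API-level check can be done from the Prometheus host (a sketch; it reuses the job name configured above):
# curl -s http://localhost:9090/api/v1/query --data-urlencode 'query=sum(rate(container_cpu_usage_seconds_total{job="k8s-cadvisor",image!=""}[1m])) without (cpu)'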
3. Displaying prometheus node and pod data in grafana
3.1 Install grafana
# download and install
# apt-get install -y adduser libfontconfig1 musl
# wget https://mirrors.tuna.tsinghua.edu.cn/grafana/apt/pool/main/g/grafana-enterprise/grafana-enterprise_10.1.2_amd64.deb
# dpkg -i grafana-enterprise_10.1.2_amd64.deb
# start the service
# systemctl start grafana-server.service
# check; here Prometheus and grafana are installed on the same host
# ss -ntpl|grep grafana
LISTEN 0 4096 *:3000 *:* users:(("grafana",pid=65973,fd=11))
3.2 Verify the web UI
Log in at http://172.20.20.201:3000
3.3 Log in and add a data source
Select Data sources
Select Prometheus
Enter a name and the Prometheus URL
Finally click Save at the bottom
3.4 Display monitoring data
3.4.1 Display node data
Search for node-related dashboard templates.
A template can be used in three ways: 1. import a downloaded json file 2. paste the json 3. use the template's ID (in an intranet-only environment, download and import the json file, or paste the json).
Select the data source.
- Display node monitoring data: go to the home page, open the corresponding dashboard and view the metrics.
3.4.2 Display pod data
Find and import a pod template in the same way as above.
4. Prometheus service discovery
4.1 Service discovery mechanism
Prometheus pulls monitoring data by default: it periodically scrapes metrics from target hosts, and each scraped target must expose an HTTP endpoint from which prometheus fetches its metrics. With this model the set of collection targets is decided by configuration, through the jobs defined in scrape_configs, and new services cannot be sensed dynamically: if nodes or components are added later, the prometheus configuration must be modified by hand and prometheus restarted, which is inconvenient. Hence dynamic service discovery: it automatically discovers new endpoints in the cluster and adds them to the configuration; through service discovery, prometheus can query the list of targets to monitor and then poll those targets for metrics.
Prometheus can obtain its data-source targets in several ways, such as static configuration and dynamic service discovery. Prometheus currently supports many service discovery mechanisms; the commonly used ones mainly fall into the following kinds:
kubernetes_sd_configs: # service discovery based on the Kubernetes API, letting prometheus dynamically discover the monitored targets in kubernetes
static_configs: # static service discovery, with targets specified in the prometheus configuration file
dns_sd_configs: # DNS-based discovery of monitoring targets
consul_sd_configs: # Consul service discovery, dynamically discovering targets from the consul service
file_sd_configs: # file-based service discovery, discovering targets from the specified files
Static discovery (static_configs): whenever there is a new target instance to monitor, the configuration file must be edited by hand to add the target.
Consul discovery (consul_sd_configs): prometheus keeps watching the consul service; when the services registered in consul change, prometheus automatically monitors all target resources registered in consul.
Kubernetes discovery (kubernetes_sd_configs): prometheus interacts with the Kubernetes API and dynamically discovers all monitorable target resources deployed in Kubernetes.
4.2 Relabeling
Prometheus relabeling can dynamically rewrite a target's metadata labels before the target is scraped, adding or overriding labels.
After prometheus dynamically discovers targets from the kubernetes API, each discovered target instance carries some raw metadata labels. The defaults include:
__address__: the target's address in <host>:<port> format
__scheme__: the scheme (HTTP or HTTPS) of the target service address
__metrics_path__: the path used to scrape the target service
4.2.1 Why relabel
To make monitoring metrics easier to identify and to support later graphing, alerting and other uses, prometheus supports rewriting the labels of discovered targets at two stages:
relabel_configs: applied before the target is scraped (for example, redefining label information such as the destination IP and port before collection). It can add, modify or delete labels, and can also restrict collection to specific targets or filter targets out.
metric_relabel_configs: applied after the target is scraped, i.e. on already collected metric data, for a final round of relabeling and filtering. (Less commonly used; a minimal sketch follows.)
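A minimal sketch of the second stage, assuming we want to discard the exporter's own go_* series after scraping (the job shown here is illustrative):
- job_name: "node"
  static_configs:
  - targets: ["172.20.20.202:9100"]
  metric_relabel_configs:
  - source_labels: [__name__]
    regex: 'go_.*'
    action: drop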
4.2.2 label
source_labels: the source labels, i.e. the label names before relabel processing
target_label: the new label name produced by the action
regex: a literal value or regular expression matched against the values of the source labels
replacement: the value written to target_label; capture groups from regex can be referenced, e.g. $1:$2
4.2.3 action
replace: replace the label value; regex is matched against the source label values, and replacement may reference the matched groups
keep: only targets whose source_labels match regex are scraped; targets that do not match are dropped
drop: targets whose source_labels match regex are not scraped; only targets that do not match are kept
hashmod: compute the hash of source_labels modulo a configured modulus, which allows targets to be sharded or reassigned, for example:
scrape_configs:
- job_name: ip_job
  relabel_configs:
  - source_labels: [__address__]
    modulus: 4
    target_label: __ip_hash
    action: hashmod
  - source_labels: [__ip_hash]
    regex: ^1$
    action: keep
labelmap: match label names against regex, then copy the values of the matching labels; groups can be referenced via replacement (${1}, ${2}, ...)
labelkeep: match label names against regex; all labels that do not match are removed from the label set
labeldrop: match label names against regex; all labels that do match are removed from the label set
4.2.4 Supported discovery roles
One of the following roles can be configured to discover targets:
node # node objects
service # discover services
pod # discover pods
endpoints # discover endpoints (pods) behind a service
endpointslice # sliced endpoints
ingress # discover ingresses
4.2.5 Service discovery for the api-server
As the core component of Kubernetes, the apiserver is well worth monitoring; its metrics can be obtained directly through the kubernetes service.
Prometheus service discovery configuration for the apiserver:
- job_name: 'kubernetes-apiserver'
  kubernetes_sd_configs:
  - role: endpoints
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https
# Keep only targets whose namespace is default, whose service name is kubernetes and whose endpoint port name is https; the source labels are the keys and regex supplies the corresponding values.
# The labels being matched are:
# __meta_kubernetes_namespace=default
# __meta_kubernetes_service_name=kubernetes
# __meta_kubernetes_endpoint_port_name=https
# The final match is the address of the api-server
4.2.6 api-server metrics
The apiserver is the entry point of the k8s cluster; every request passes through it, so monitoring apiserver metrics helps judge cluster health.
4.2.6.1 apiserver_request_total
apiserver_request_total gives detailed request statistics for the individual services.
# the following promQL counts apiserver requests over the last ten minutes, grouped by resource, subresource and verb:
sum(rate(apiserver_request_total[10m])) by (resource,subresource,verb)
4.2.6.2 About annotation_prometheus_io_scrape
In k8s, with this prometheus discovery rule the target must carry an annotation matching annotation_prometheus_io_scrape=true; only targets where this annotation matches are kept for monitoring. Data is then scraped and labels are rewritten, e.g. the annotation_prometheus_io_scheme label becomes http or https.
- job_name: 'kubernetes-service-endpoints' # job name
  kubernetes_sd_configs: # discovery configuration
  - role: endpoints # role: discover endpoints
  relabel_configs: # relabel configuration
  # keep the target only if annotation_prometheus_io_scrape is true, then continue
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  # rewrite __meta_kubernetes_service_annotation_prometheus_io_scheme into __scheme__
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: (https?) # matches http or https (? matches the preceding character 0 or 1 times), i.e. other schemes are not replaced
  # rewrite __meta_kubernetes_service_annotation_prometheus_io_path into __metrics_path__
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+) # a path of length 1 or more (. matches any single character except \n, + means one or more times)
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2 # address:port format
  # map all labels matched by regex:
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+) # match label names by regex
  # rewrite __meta_kubernetes_namespace into kubernetes_namespace
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  # rewrite __meta_kubernetes_service_name into kubernetes_service_name
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: kubernetes_service_name
4.3 Service discovery types
Prometheus can obtain its targets from static configuration or dynamic service discovery. The commonly used discovery mechanisms are:
static discovery, file-based discovery, DNS discovery, Consul discovery, and Kubernetes-API-based discovery.
4.3.1 Static service discovery
Static discovery: targets are specified in the prometheus configuration file; every new target instance to monitor requires a manual edit of the configuration file to add the target.
scrape_configs:
- job_name: "k8s-cadvisor" # job name
  # metrics_path: "/metrics" # default URI
  # scheme: http # default scheme
  static_configs: # static targets
  - targets: ["172.20.20.101:8080","172.20.20.102:8080","172.20.20.103:8080","172.20.20.111:8080","172.20.20.112:8080","172.20.20.113:8080"] # target endpoint addresses
4.3.2 File-based service discovery
Discover monitoring targets from the specified files; a sample target file is shown after the config.
scrape_configs:
# file-based service discovery job
- job_name: 'file_sd_test'
  scrape_interval: 10s # scrape interval
  file_sd_configs:
  - files: # yaml and json files are supported
    - /data/prometheus/*.yml
    refresh_interval: 10s # how often the files are re-read
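A minimal target file this job would pick up (the file name and labels are illustrative):
# vim /data/prometheus/sd_my_server.yml
- targets: ["172.20.20.202:9100", "172.20.20.203:9100"]
  labels:
    env: test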
4.3.3 DNS service discovery
DNS-based service discovery takes a configured set of DNS names, which are queried periodically to discover the target list; the names must be resolvable to IPs by the configured DNS server.
This discovery mechanism supports only basic DNS A, AAAA and SRV record queries.
- A record: resolves a name to an IPv4 address
# vim /etc/hosts
172.20.20.202 node1.example.com
172.20.20.203 node2.example.com
# vim /apps/prometheus/prometheus.yml
- job_name: 'dns-server-name-monitor'
  metrics_path: "/metrics"
  dns_sd_configs:
  - names: ["node1.example.com", "node2.example.com"] # target nodes to monitor
    type: A # A record
    port: 9100 # node-exporter port on the targets
    refresh_interval: 15s # refresh discovery every 15s (default 30s)
- AAAA record: resolves a name to an IPv6 address
- SRV: an SRV record declares which host provides which service, in the form service-name.protocol.domain (e.g. _prometheus._tcp.node.example.com)
- job_name: 'dns-node-monitor-srv'
  metrics_path: "/metrics"
  dns_sd_configs:
  - names: ["_prometheus._tcp.node.example.com"]
    type: SRV
    port: 9100
    refresh_interval: 15s
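Whether the SRV record resolves as expected can be checked before wiring it into prometheus (a sketch; assumes dig is installed and the record exists on the DNS server in use):
# dig +short -t SRV _prometheus._tcp.node.example.com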
4.3.4 Consul service discovery
Consul is a distributed key/value store commonly used for service registration and discovery. With consul-based discovery, prometheus keeps watching the consul service; whenever the services registered in consul change, prometheus automatically monitors all target resources registered in consul.
scrape_configs:
- job_name: 'consul_sd_test'
  honor_labels: true # when scraped labels conflict with prometheus server-side labels, keep the scraped labels
  metrics_path: "/metrics"
  scheme: http
  consul_sd_configs:
  - server: 172.20.20.202:8500
    services: [] # service names to discover; empty means all services
  - server: 172.20.20.203:8500
    services: []
Parameter notes:
honor_labels controls how prometheus handles conflicts between labels already present in the scraped data and the server-side labels prometheus attaches (the "job" and "instance" labels, manually configured target labels, and labels generated by service discovery).
If honor_labels is set to "true", conflicts are resolved by keeping the label values from the scraped data and ignoring the conflicting server-side labels. Additionally, if the scraped data has a label with an empty value, the local prometheus label value is used; if the scraped data lacks a label that prometheus has configured, the configured value is used.
If honor_labels is set to "false", conflicts are resolved by renaming conflicting labels in the scraped data to exported_<original-label> (e.g. exported_instance, exported_job) and then attaching the server-side labels.
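For completeness, this is roughly how a node-exporter instance could be registered in consul through its HTTP API (a sketch; the service id, name and address are examples):
# curl -X PUT -d '{"id": "node-exporter-202", "name": "node-exporter", "address": "172.20.20.202", "port": 9100}' http://172.20.20.202:8500/v1/agent/service/register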
4.3.5 Kubernetes-API-based service discovery
With Kubernetes-API-based service discovery, prometheus interacts with the kubernetes API and dynamically discovers all monitorable target resources deployed in kubernetes.
scrape_configs:
- job_name: "kubernetes_sd_test"
  scheme: http
  kubernetes_sd_configs:
  - role: node
5. Implementing kubernetes-apiserver and coredns service discovery in prometheus
5.1 Environment preparation
# create the configmap
# vim case3-1-prometheus-cfg.yaml
---
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 1m
    scrape_configs:
    - job_name: 'kubernetes-node'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-node-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiserver'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_service_name
# kubectl apply -f case3-1-prometheus-cfg.yaml
# create the monitoring service account
# kubectl create serviceaccount monitor -n monitoring
# grant permissions to the monitor account
# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitoring --clusterrole=cluster-admin --serviceaccount=monitoring:monitor
# create the deployment
# vim case3-2-prometheus-deployment.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
      component: server
    #matchExpressions:
    #- {key: app, operator: In, values: [prometheus]}
    #- {key: component, operator: In, values: [server]}
  template:
    metadata:
      labels:
        app: prometheus
        component: server
      annotations:
        prometheus.io/scrape: 'false'
    spec:
      nodeName: 172.20.20.113
      serviceAccountName: monitor
      containers:
      - name: prometheus
        image: prom/prometheus:v2.31.2
        imagePullPolicy: IfNotPresent
        command:
        - prometheus
        - --config.file=/etc/prometheus/prometheus.yml
        - --storage.tsdb.path=/prometheus
        - --storage.tsdb.retention=720h
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: /etc/prometheus/prometheus.yml
          name: prometheus-config
          subPath: prometheus.yml
        - mountPath: /prometheus/
          name: prometheus-storage-volume
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
          items:
          - key: prometheus.yml
            path: prometheus.yml
            mode: 0644
      - name: prometheus-storage-volume
        hostPath:
          path: /data/prometheusdata
          type: Directory
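Note: with hostPath type: Directory the directory must already exist on the scheduled node, otherwise the pod stays in ContainerCreating. The prom/prometheus image also runs as the unprivileged nobody user (uid 65534), so the directory must be writable by it (a sketch, run on node 172.20.20.113):
# mkdir -p /data/prometheusdata
# chown 65534:65534 /data/prometheusdata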
# kubectl apply -f case3-2-prometheus-deployment.yaml
# check
# kubectl get pod -n monitoring
NAME READY STATUS RESTARTS AGE
...
prometheus-server-6df6944859-7kwb4 1/1 Running 0 4m10s
# create the service
# vim case3-3-prometheus-svc.yaml
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    nodePort: 39090
    protocol: TCP
  selector:
    app: prometheus
    component: server
# kubectl apply -f case3-3-prometheus-svc.yaml
# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
prometheus NodePort 10.100.76.98 <none> 9090:39090/TCP 14s
5.2 kubernetes-apiserver service discovery
# the full configuration is in the configmap created above; this is the part relevant to kubernetes-apiserver discovery
- job_name: 'kubernetes-apiserver'
  kubernetes_sd_configs:
  - role: endpoints
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https
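Once the configmap is applied and the pod restarted, the discovered apiserver target can be verified through the NodePort service (a sketch; the node IP is an example):
# curl -s 'http://172.20.20.113:39090/api/v1/targets?state=active' | grep -o '"job":"kubernetes-apiserver"'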
5.3 coredns service discovery
# the relevant configuration
- job_name: 'kubernetes-service-endpoints'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: (https?)
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: kubernetes_service_name
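In most deployments the kube-dns service in kube-system already carries the prometheus.io/scrape: "true" annotation and exposes the coredns metrics port, so the coredns endpoints should appear under this job. A quick check (a sketch; the node IP is an example):
# kubectl get svc kube-dns -n kube-system -o yaml | grep -B1 -A3 annotations
# curl -s http://172.20.20.113:39090/api/v1/query --data-urlencode 'query=up{kubernetes_namespace="kube-system"}'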