rbac.yaml

Cluster-level RBAC permissions used by the component

apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
  name: node-exporter
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-exporter
rules:
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-exporter
subjects:
- kind: ServiceAccount
  name: node-exporter
  namespace: monitoring
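
The TokenReview and SubjectAccessReview permissions above are consumed by the kube-rbac-proxy sidecar defined in node-exporter.yaml below: it validates the scraper's bearer token and then authorizes the request, by default treating the request path (/metrics) as a non-resource URL. The other half of that handshake is RBAC on the client side: the ServiceAccount that Prometheus scrapes with must be allowed to get /metrics. A minimal sketch, assuming Prometheus runs with a prometheus-k8s ServiceAccount in the monitoring namespace (the names here are placeholders, adjust them to your setup):

# hypothetical ClusterRole granting the scraper access to the /metrics path,
# which kube-rbac-proxy checks via a SubjectAccessReview by default
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-exporter-metrics-reader   # hypothetical name
rules:
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-exporter-metrics-reader   # hypothetical name
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-exporter-metrics-reader
subjects:
- kind: ServiceAccount
  name: prometheus-k8s                 # assumption: the ServiceAccount your Prometheus scrapes with
  namespace: monitoring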

node-exporter.yaml

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      annotations:
        kubectl.kubernetes.io/default-container: node-exporter
      labels:
        app: node-exporter
    spec:
      automountServiceAccountToken: true
      containers:
      - args:
        - --web.listen-address=127.0.0.1:9100
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --path.udev.data=/host/root/run/udev/data
        - --no-collector.wifi
        - --no-collector.hwmon
        - --no-collector.btrfs
        - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run/k3s/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
        - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
        - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
        image: quay.io/prometheus/node-exporter:v1.9.1
        name: node-exporter
        resources:
          limits:
            cpu: 250m
            memory: 180Mi
          requests:
            cpu: 102m
            memory: 180Mi
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            add:
            - SYS_TIME
            drop:
            - ALL
          readOnlyRootFilesystem: true
        volumeMounts:
        - mountPath: /host/sys
          mountPropagation: HostToContainer
          name: sys
          readOnly: true
        - mountPath: /host/root
          mountPropagation: HostToContainer
          name: root
          readOnly: true
      - args:
        - --secure-listen-address=[$(IP)]:9100
        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
        - --upstream=http://127.0.0.1:9100/
        env:
        - name: IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        image: quay.io/brancz/kube-rbac-proxy:v0.18.1
        name: kube-rbac-proxy
        ports:
        - containerPort: 9100
          hostPort: 9100
          name: https
        resources:
          limits:
            cpu: 20m
            memory: 40Mi
          requests:
            cpu: 10m
            memory: 20Mi
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsGroup: 65532
          runAsNonRoot: true
          runAsUser: 65532
          seccompProfile:
            type: RuntimeDefault
      hostNetwork: true
      hostPID: true
      nodeSelector:
        kubernetes.io/os: linux
      priorityClassName: system-cluster-critical
      securityContext:
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: node-exporter
      tolerations:
      - operator: Exists
      volumes:
      - hostPath:
          path: /sys
        name: sys
      - hostPath:
          path: /
        name: root
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 10%
    type: RollingUpdate
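
node-exporter itself only listens on 127.0.0.1:9100; kube-rbac-proxy terminates TLS on the node IP (hostNetwork is enabled) and proxies authenticated requests to it. One caveat: the example rules at the end of this page include a systemd alert (NodeSystemdServiceFailed) built on node_systemd_unit_state, but the systemd collector is disabled by default and is not enabled by the args above. A minimal sketch of the extra pieces one might merge into the node-exporter container, under the assumption that the host's systemd D-Bus socket under /run/systemd is reachable from the container (verify the exact requirements for your node_exporter version):

# hypothetical additions to the node-exporter container in the DaemonSet above
args:
- --collector.systemd           # the systemd collector is off by default
volumeMounts:
- mountPath: /run/systemd       # assumption: expose the host systemd D-Bus socket
  name: systemd
volumes:                        # pod-level volume backing the mount above
- hostPath:
    path: /run/systemd
  name: systemd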

service.yaml

apiVersion: v1
kind: Service
metadata:
  labels:
    app: node-exporter
  name: node-exporter
  namespace: monitoring
spec:
  clusterIP: None
  ports:
  - name: https
    port: 9100
    targetPort: https
  selector:
    app: node-exporter

servicemonitor.yaml

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: node-exporter
  name: node-exporter
  namespace: monitoring
spec:
  endpoints:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    interval: 15s
    port: https
    relabelings:
    - action: replace
      regex: (.*)
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_pod_node_name
      targetLabel: instance
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  jobLabel: app
  selector:
    matchLabels:
      app: node-exporter
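
The ServiceMonitor is only scraped if the Prometheus custom resource selects it. A minimal sketch of the relevant fields, assuming a Prometheus resource named prometheus in the monitoring namespace (the names are placeholders; an empty serviceMonitorSelector would match every ServiceMonitor):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus                      # assumption: adjust to your Prometheus resource
  namespace: monitoring
spec:
  serviceAccountName: prometheus-k8s    # assumption: must hold the /metrics ClusterRole sketched earlier
  serviceMonitorNamespaceSelector: {}   # search all namespaces
  serviceMonitorSelector:
    matchLabels:
      app: node-exporter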

Example rules

nodeExporter-prometheusRule.yaml

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: prometheus
    role: alert-rules
  name: node-exporter-rules
  namespace: monitoring
spec:
  groups:
  - name: node-exporter
    rules:
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available space left and is filling up fast.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 24 hours.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available space left and is filling up fast.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 4 hours.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available space left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
        summary: Filesystem has less than 5% space left.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 30m
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available space left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
        summary: Filesystem has less than 3% space left.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 30m
      labels:
        severity: critical
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available inodes left and is filling up fast.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available inodes left and is filling up fast.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available inodes left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 5% inodes left.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }}, has only {{ $value }}% available inodes left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 3% inodes left.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeNetworkReceiveErrs
      annotations:
        description: Interface {{ $labels.device }} on {{ $labels.instance }} has encountered {{ $value }} receive errors in the last two minutes.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
        summary: Network interface is reporting many receive errors.
      expr: |
        rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
      for: 1h
      labels:
        severity: warning
    - alert: NodeNetworkTransmitErrs
      annotations:
        description: Interface {{ $labels.device }} on {{ $labels.instance }} has encountered {{ $value }} transmit errors in the last two minutes.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
        summary: Network interface is reporting many transmit errors.
      expr: |
        rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
      for: 1h
      labels:
        severity: warning
    - alert: NodeHighNumberConntrackEntriesUsed
      annotations:
        description: Conntrack entry usage is at {{ $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
        summary: Number of conntrack entries is getting close to the limit.
      expr: |
        (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
      labels:
        severity: warning
    - alert: NodeTextFileCollectorScrapeError
      annotations:
        description: Node Exporter textfile collector on {{ $labels.instance }} failed to scrape.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
        summary: Node Exporter textfile collector failed to scrape.
      expr: |
        node_textfile_scrape_error{job="node-exporter"} == 1
      labels:
        severity: warning
    - alert: NodeClockSkewDetected
      annotations:
        description: Clock at {{ $labels.instance }} is out of sync by more than 0.05 seconds. Ensure NTP is configured correctly on this host.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
        summary: Clock skew detected.
      expr: |
        (
          node_timex_offset_seconds{job="node-exporter"} > 0.05
        and
          deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
        )
        or
        (
          node_timex_offset_seconds{job="node-exporter"} < -0.05
        and
          deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
        )
      for: 10m
      labels:
        severity: warning
    - alert: NodeClockNotSynchronising
      annotations:
        description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
        summary: Clock is not synchronising.
      expr: |
        min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
        and
        node_timex_maxerror_seconds{job="node-exporter"} >= 16
      for: 10m
      labels:
        severity: warning
    - alert: NodeRAIDDegraded
      annotations:
        description: RAID array {{ $labels.device }} at {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
        summary: RAID array is degraded.
      expr: |
        node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
      for: 15m
      labels:
        severity: critical
    - alert: NodeRAIDDiskFailure
      annotations:
        description: At least one device in the RAID array at {{ $labels.instance }} has failed. Array {{ $labels.device }} needs attention and possibly a disk swap.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
        summary: Failed device in RAID array.
      expr: |
        node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
      labels:
        severity: warning
    - alert: NodeFileDescriptorLimit
      annotations:
        description: File descriptor usage on {{ $labels.instance }} is currently at {{ $value }}% of the limit.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
        summary: Kernel is predicted to exhaust the file descriptors limit soon.
      expr: |
        (
          node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
        )
      for: 15m
      labels:
        severity: warning
    - alert: NodeFileDescriptorLimit
      annotations:
        description: File descriptor usage on {{ $labels.instance }} is currently at {{ $value }}% of the limit.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
        summary: Kernel is predicted to exhaust the file descriptors limit soon.
      expr: |
        (
          node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
        )
      for: 15m
      labels:
        severity: critical
    - alert: NodeCPUHighUsage
      annotations:
        description: |
          CPU usage on {{ $labels.instance }} has been above 90% for the last 15 minutes, and is currently at {{ $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
        summary: High CPU usage.
      expr: |
        sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
      for: 15m
      labels:
        severity: info
    - alert: NodeSystemSaturation
      annotations:
        description: |
          System load per core on {{ $labels.instance }} has been above 2 for the last 15 minutes, and is currently at {{ $value }}.
          This might indicate that the instance is saturated and may become unresponsive.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
        summary: System is saturated, load per core is very high.
      expr: |
        node_load1{job="node-exporter"}
        / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
      for: 15m
      labels:
        severity: warning
    - alert: NodeMemoryMajorPagesFaults
      annotations:
        description: |
          Memory major page faults are occurring at a very high rate on {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, and the rate is currently {{ $value }}.
          Check that the instance has enough memory available.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
        summary: Memory major page faults are occurring at a very high rate.
      expr: |
        rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
      for: 15m
      labels:
        severity: warning
    - alert: NodeMemoryHighUtilization
      annotations:
        description: |
          Memory on {{ $labels.instance }} is filling up; usage has been above 90% for the last 15 minutes and is currently at {{ $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
        summary: Host is running out of memory.
      expr: |
        100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
      for: 15m
      labels:
        severity: warning
    - alert: NodeDiskIOSaturation
      annotations:
        description: |
          The disk IO queue (aqu-sq) on device {{ $labels.device }} at {{ $labels.instance }} is very high; it has been above 10 for the last 30 minutes and is currently at {{ $value }}.
          This symptom might indicate disk saturation.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
        summary: Disk IO queue is high.
      expr: |
        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
      for: 30m
      labels:
        severity: warning
    - alert: NodeSystemdServiceFailed
      annotations:
        description: systemd service {{ $labels.name }} on {{ $labels.instance }} has entered failed state.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
        summary: systemd service has entered failed state.
      expr: |
        node_systemd_unit_state{job="node-exporter", state="failed"} == 1
      for: 5m
      labels:
        severity: warning
    - alert: NodeBondingDegraded
      annotations:
        description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is degraded because one or more slave devices failed.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
        summary: Bonding interface is degraded.
      expr: |
        (node_bonding_slaves - node_bonding_active) != 0
      for: 5m
      labels:
        severity: warning
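
These rules are only loaded if the Prometheus custom resource's ruleSelector matches the labels on the PrometheusRule (prometheus: prometheus and role: alert-rules here). A minimal sketch of the matching selector, continuing the hypothetical Prometheus resource from the ServiceMonitor section (an empty ruleSelector would match every PrometheusRule):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus              # assumption: same placeholder Prometheus resource as above
  namespace: monitoring
spec:
  ruleNamespaceSelector: {}     # search all namespaces for PrometheusRule objects
  ruleSelector:
    matchLabels:
      prometheus: prometheus
      role: alert-rules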