使用 Prometheus 来监控 Karmada 成员集群

Prometheus 是云原生计算基金会(CNCF)的项目,是一个系统和服务监控系统。它以给定的时间间隔从配置的目标收集指标,评估规则表达式,显示结果,并在观察到指定条件时触发警报。本文举例说明如何使用 Prometheus 来监控 Karmada 成员集群。

启动 Karmada 集群

你只需要克隆 Karmada repo,并在 Karmada 目录下运行以下脚本。

  1. hack/local-up-karmada.sh

启动 Prometheus

  1. 在 Karmada 控制面创建 Prometheus 的资源对象,内容如下。

    # Namespace that contains all namespaced Prometheus objects of this manifest.
    apiVersion: v1
    kind: Namespace
    metadata:
      name: monitor
      labels:
        name: monitor
    ---
    # Read-only, cluster-wide permissions used by Prometheus for Kubernetes
    # service discovery and for scraping the /metrics endpoint.
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: prometheus
    rules:
      # Core API group: targets discovered by the node, service, endpoints and
      # pod roles; nodes/proxy is needed for the API-server-proxied node scrapes.
      - apiGroups: [""]
        resources:
          - nodes
          - nodes/proxy
          - services
          - endpoints
          - pods
        verbs: ["get", "list", "watch"]
      # Ingress objects are served from networking.k8s.io on current clusters
      # (extensions/v1beta1 Ingress is no longer served as of Kubernetes 1.22).
      # The legacy group is kept so older clusters keep working.
      - apiGroups:
          - extensions
          - networking.k8s.io
        resources:
          - ingresses
        verbs: ["get", "list", "watch"]
      - nonResourceURLs: ["/metrics"]
        verbs: ["get"]
    ---
    # Identity the Prometheus pod runs as (referenced by serviceAccountName in
    # the Deployment and by the ClusterRoleBinding subject).
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: prometheus
      namespace: monitor
    ---
    # Grants the `prometheus` ClusterRole to the `prometheus` ServiceAccount
    # in the `monitor` namespace.
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: prometheus
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: prometheus
    subjects:
      - kind: ServiceAccount
        name: prometheus
        namespace: monitor
    ---
    # Prometheus configuration file; the Deployment mounts this ConfigMap at
    # /etc/prometheus and starts Prometheus with
    # --config.file=/etc/prometheus/prometheus.yml.
    # NOTE(review): the probe jobs point at blackbox-exporter.example.com:9115
    # and the last job at a kube-state-metrics Service; neither is created by
    # this manifest, so those targets must exist separately to be scraped.
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-config
      namespace: monitor
    data:
      prometheus.yml: |
        global:
          scrape_interval: 15s
          evaluation_interval: 15s
        scrape_configs:
          - job_name: 'kubernetes-apiservers'
            kubernetes_sd_configs:
              - role: endpoints
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            relabel_configs:
              - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
                action: keep
                regex: default;kubernetes;https
          - job_name: 'kubernetes-nodes'
            kubernetes_sd_configs:
              - role: node
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            relabel_configs:
              - action: labelmap
                regex: __meta_kubernetes_node_label_(.+)
              - target_label: __address__
                replacement: kubernetes.default.svc:443
              - source_labels: [__meta_kubernetes_node_name]
                regex: (.+)
                target_label: __metrics_path__
                replacement: /api/v1/nodes/${1}/proxy/metrics
          - job_name: 'kubernetes-cadvisor'
            kubernetes_sd_configs:
              - role: node
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            relabel_configs:
              - action: labelmap
                regex: __meta_kubernetes_node_label_(.+)
              - target_label: __address__
                replacement: kubernetes.default.svc:443
              - source_labels: [__meta_kubernetes_node_name]
                regex: (.+)
                target_label: __metrics_path__
                replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
          - job_name: 'kubernetes-service-endpoints'
            kubernetes_sd_configs:
              - role: endpoints
            relabel_configs:
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                action: keep
                regex: true
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                action: replace
                target_label: __scheme__
                regex: (https?)
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                action: replace
                target_label: __metrics_path__
                regex: (.+)
              - source_labels: [__address__,__meta_kubernetes_service_annotation_prometheus_io_port]
                action: replace
                target_label: __address__
                regex: ([^:]+)(?::\d+)?;(\d+)
                replacement: $1:$2
              - action: labelmap
                regex: __meta_kubernetes_service_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                action: replace
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_service_name]
                action: replace
                target_label: kubernetes_name
          - job_name: 'kubernetes-services'
            kubernetes_sd_configs:
              - role: service
            metrics_path: /probe
            params:
              module: [http_2xx]
            relabel_configs:
              - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
                action: keep
                regex: true
              - source_labels: [__address__]
                target_label: __param_target
              - target_label: __address__
                replacement: blackbox-exporter.example.com:9115
              - source_labels: [__param_target]
                target_label: instance
              - action: labelmap
                regex: __meta_kubernetes_service_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_service_name]
                target_label: kubernetes_name
          - job_name: 'kubernetes-ingresses'
            kubernetes_sd_configs:
              - role: ingress
            relabel_configs:
              - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
                action: keep
                regex: true
              - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
                regex: (.+);(.+);(.+)
                replacement: ${1}://${2}${3}
                target_label: __param_target
              - target_label: __address__
                replacement: blackbox-exporter.example.com:9115
              - source_labels: [__param_target]
                target_label: instance
              - action: labelmap
                regex: __meta_kubernetes_ingress_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_ingress_name]
                target_label: kubernetes_name
          - job_name: 'kubernetes-pods'
            kubernetes_sd_configs:
              - role: pod
            relabel_configs:
              - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                action: keep
                regex: true
              - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                action: replace
                target_label: __metrics_path__
                regex: (.+)
              - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                action: replace
                regex: ([^:]+)(?::\d+)?;(\d+)
                replacement: $1:$2
                target_label: __address__
              - action: labelmap
                regex: __meta_kubernetes_pod_label_(.+)
              - source_labels: [__meta_kubernetes_namespace]
                action: replace
                target_label: kubernetes_namespace
              - source_labels: [__meta_kubernetes_pod_name]
                action: replace
                target_label: kubernetes_pod_name
          - job_name: kube-state-metrics
            static_configs:
              - targets: ['kube-state-metrics.monitor.svc.cluster.local:8080']
    ---
    # Exposes the Prometheus web UI/API (container port 9090) on port 30003 of
    # every node in the cluster.
    apiVersion: v1
    kind: Service
    metadata:
      name: prometheus
      namespace: monitor
      labels:
        app: prometheus
    spec:
      type: NodePort
      selector:
        app: prometheus
      ports:
        - port: 9090
          targetPort: 9090
          nodePort: 30003
    ---
    # Single-replica Prometheus server. Configuration comes from the
    # `prometheus-config` ConfigMap; TSDB data is kept on the node via hostPath.
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      labels:
        name: prometheus-deployment
      name: prometheus
      namespace: monitor
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: prometheus
      template:
        metadata:
          labels:
            app: prometheus
        spec:
          containers:
            - image: prom/prometheus
              imagePullPolicy: IfNotPresent
              name: prometheus
              command:
                - "/bin/prometheus"
              args:
                - "--config.file=/etc/prometheus/prometheus.yml"
                - "--storage.tsdb.path=/home/prometheus"
                # Keep 7 days of data. `--storage.tsdb.retention.time` replaces
                # the deprecated `--storage.tsdb.retention`, which Prometheus 3.x
                # no longer accepts; the untagged image may resolve to such a release.
                - "--storage.tsdb.retention.time=168h"
                # Enables the HTTP reload/quit endpoints (/-/reload, /-/quit).
                - "--web.enable-lifecycle"
              ports:
                - containerPort: 9090
                  protocol: TCP
              volumeMounts:
                - mountPath: "/home/prometheus"
                  name: data
                - mountPath: "/etc/prometheus"
                  name: config-volume
              resources:
                requests:
                  cpu: 100m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 3180Mi
          serviceAccountName: prometheus
          # Runs as root; presumably so the process can write to the hostPath
          # data directory below - confirm before tightening.
          securityContext:
            runAsUser: 0
          volumes:
            - name: data
              hostPath:
                path: "/data/prometheus/data"
            - name: config-volume
              configMap:
                name: prometheus-config
  2. 运行下面的命令,创建 Karmada PropagationPolicy 和 ClusterPropagationPolicy,将上述资源分发到成员集群。

    # Create the PropagationPolicy that selects the Prometheus objects of the
    # `monitor` namespace and places them on member1, member2 and member3.
    kubectl apply -f - <<EOF
    apiVersion: policy.karmada.io/v1alpha1
    kind: PropagationPolicy
    metadata:
      name: prometheus-propagation
      namespace: monitor
    spec:
      resourceSelectors:
        - apiVersion: v1
          kind: Namespace
          name: monitor
        - apiVersion: v1
          kind: ServiceAccount
          name: prometheus
          namespace: monitor
        - apiVersion: v1
          kind: ConfigMap
          name: prometheus-config
          namespace: monitor
        - apiVersion: v1
          kind: Service
          name: prometheus
          namespace: monitor
        - apiVersion: apps/v1
          kind: Deployment
          name: prometheus
          namespace: monitor
      placement:
        clusterAffinity:
          clusterNames:
            - member1
            - member2
            - member3
    EOF
    # Create the ClusterPropagationPolicy that selects the cluster-scoped RBAC
    # objects (ClusterRole and ClusterRoleBinding named `prometheus`) and places
    # them on member1, member2 and member3.
    kubectl apply -f - <<EOF
    apiVersion: policy.karmada.io/v1alpha1
    kind: ClusterPropagationPolicy
    metadata:
      name: prometheusrbac-propagation
    spec:
      resourceSelectors:
        - apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRole
          name: prometheus
        - apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRoleBinding
          name: prometheus
      placement:
        clusterAffinity:
          clusterNames:
            - member1
            - member2
            - member3
    EOF
  3. 使用成员集群任一节点的 IP 和端口号(默认为 30003)访问该成员集群的 Prometheus 监控页面。

参考资料