监控告警配置

监控告警配置

推荐使用 Prometheus 作为监控和性能指标信息存储方案,使用 Grafana 作为可视化组件进行展示。

Prometheus 配置

  1. 在 Prometheus 配置的 scrape_configs 中添加如下 job:

如果 Zadig 和 Prometheus 在同一集群中

- job_name: prometheus
  metrics_path: /api/metrics
  static_configs:
    - targets:
        - aslan.<部署 namespace>.svc:25000

如果 Zadig 和 Prometheus 在不同集群中

- job_name: admin
  metrics_path: /api/aslan/metrics
  static_configs:
    - targets:
        - <Zadig的访问域名>
  scheme: https
  2. 重新加载 Prometheus 配置
  3. 在 Prometheus 的 query 界面,输入 request_total 进行查询,确认有数据,证明配置成功

Grafana 配置

Dashboard 配置

  1. [可选项]在 Grafana 的 configuration - Data source 中配置上文的 Prometheus 为数据源
  2. 在 dashboards - import 中输入如下 JSON 导入 Grafana 面板(JSON 中的数据源 uid 6JQZS4L4z 为示例值,导入前请替换为实际 Prometheus 数据源的 uid)
  1. {
  2. "annotations": {
  3. "list": [
  4. {
  5. "builtIn": 1,
  6. "datasource": {
  7. "type": "grafana",
  8. "uid": "-- Grafana --"
  9. },
  10. "enable": true,
  11. "hide": true,
  12. "iconColor": "rgba(0, 211, 255, 1)",
  13. "name": "Annotations & Alerts",
  14. "target": {
  15. "limit": 100,
  16. "matchAny": false,
  17. "tags": [],
  18. "type": "dashboard"
  19. },
  20. "type": "dashboard"
  21. }
  22. ]
  23. },
  24. "editable": true,
  25. "fiscalYearStartMonth": 0,
  26. "graphTooltip": 0,
  27. "id": 3,
  28. "links": [],
  29. "liveNow": false,
  30. "panels": [
  31. {
  32. "description": "",
  33. "fieldConfig": {
  34. "defaults": {
  35. "color": {
  36. "mode": "thresholds"
  37. },
  38. "mappings": [],
  39. "thresholds": {
  40. "mode": "absolute",
  41. "steps": [
  42. {
  43. "color": "green",
  44. "value": null
  45. },
  46. {
  47. "color": "red",
  48. "value": 80
  49. }
  50. ]
  51. }
  52. },
  53. "overrides": []
  54. },
  55. "gridPos": {
  56. "h": 9,
  57. "w": 6,
  58. "x": 0,
  59. "y": 0
  60. },
  61. "id": 4,
  62. "options": {
  63. "orientation": "auto",
  64. "reduceOptions": {
  65. "calcs": [
  66. "lastNotNull"
  67. ],
  68. "fields": "",
  69. "values": false
  70. },
  71. "showThresholdLabels": false,
  72. "showThresholdMarkers": true
  73. },
  74. "pluginVersion": "9.4.7",
  75. "targets": [
  76. {
  77. "editorMode": "code",
  78. "expr": "running_workflows",
  79. "legendFormat": "__auto",
  80. "range": true,
  81. "refId": "A"
  82. }
  83. ],
  84. "title": "运行中的工作流",
  85. "type": "gauge"
  86. },
  87. {
  88. "description": "",
  89. "fieldConfig": {
  90. "defaults": {
  91. "color": {
  92. "mode": "thresholds"
  93. },
  94. "mappings": [],
  95. "thresholds": {
  96. "mode": "absolute",
  97. "steps": [
  98. {
  99. "color": "green",
  100. "value": null
  101. },
  102. {
  103. "color": "red",
  104. "value": 80
  105. }
  106. ]
  107. }
  108. },
  109. "overrides": []
  110. },
  111. "gridPos": {
  112. "h": 9,
  113. "w": 6,
  114. "x": 6,
  115. "y": 0
  116. },
  117. "id": 6,
  118. "options": {
  119. "orientation": "auto",
  120. "reduceOptions": {
  121. "calcs": [
  122. "lastNotNull"
  123. ],
  124. "fields": "",
  125. "values": false
  126. },
  127. "showThresholdLabels": false,
  128. "showThresholdMarkers": true
  129. },
  130. "pluginVersion": "9.4.7",
  131. "targets": [
  132. {
  133. "editorMode": "code",
  134. "expr": "pending_workflows",
  135. "legendFormat": "__auto",
  136. "range": true,
  137. "refId": "A"
  138. }
  139. ],
  140. "title": "排队中的工作流",
  141. "type": "gauge"
  142. },
  143. {
  144. "datasource": {
  145. "type": "prometheus",
  146. "uid": "6JQZS4L4z"
  147. },
  148. "fieldConfig": {
  149. "defaults": {
  150. "color": {
  151. "mode": "thresholds"
  152. },
  153. "mappings": [],
  154. "max": 1,
  155. "min": 0,
  156. "noValue": "0",
  157. "thresholds": {
  158. "mode": "absolute",
  159. "steps": [
  160. {
  161. "color": "green",
  162. "value": null
  163. }
  164. ]
  165. },
  166. "unit": "percentunit"
  167. },
  168. "overrides": []
  169. },
  170. "gridPos": {
  171. "h": 9,
  172. "w": 12,
  173. "x": 12,
  174. "y": 0
  175. },
  176. "id": 20,
  177. "options": {
  178. "displayMode": "gradient",
  179. "minVizHeight": 10,
  180. "minVizWidth": 0,
  181. "orientation": "auto",
  182. "reduceOptions": {
  183. "calcs": [
  184. "last"
  185. ],
  186. "fields": "",
  187. "values": false
  188. },
  189. "showUnfilled": true
  190. },
  191. "pluginVersion": "9.4.7",
  192. "targets": [
  193. {
  194. "datasource": {
  195. "type": "prometheus",
  196. "uid": "6JQZS4L4z"
  197. },
  198. "editorMode": "builder",
  199. "exemplar": false,
  200. "expr": "healthy{service=~\"aslan|dind|hub-server|plutus-vendor|user|zadig-portal\"}",
  201. "format": "time_series",
  202. "instant": true,
  203. "legendFormat": "{{service}}",
  204. "range": false,
  205. "refId": "A"
  206. }
  207. ],
  208. "title": "服务健康状态",
  209. "type": "bargauge"
  210. },
  211. {
  212. "datasource": {
  213. "type": "prometheus",
  214. "uid": "6JQZS4L4z"
  215. },
  216. "fieldConfig": {
  217. "defaults": {
  218. "color": {
  219. "mode": "palette-classic"
  220. },
  221. "custom": {
  222. "hideFrom": {
  223. "legend": false,
  224. "tooltip": false,
  225. "viz": false
  226. }
  227. },
  228. "mappings": [],
  229. "max": -1,
  230. "unit": "none"
  231. },
  232. "overrides": [
  233. {
  234. "matcher": {
  235. "id": "byName",
  236. "options": "正常"
  237. },
  238. "properties": [
  239. {
  240. "id": "color",
  241. "value": {
  242. "fixedColor": "green",
  243. "mode": "fixed"
  244. }
  245. }
  246. ]
  247. },
  248. {
  249. "matcher": {
  250. "id": "byName",
  251. "options": "异常"
  252. },
  253. "properties": [
  254. {
  255. "id": "color",
  256. "value": {
  257. "fixedColor": "red",
  258. "mode": "fixed"
  259. }
  260. }
  261. ]
  262. },
  263. {
  264. "matcher": {
  265. "id": "byName",
  266. "options": "等待接入"
  267. },
  268. "properties": [
  269. {
  270. "id": "color",
  271. "value": {
  272. "fixedColor": "blue",
  273. "mode": "fixed"
  274. }
  275. }
  276. ]
  277. },
  278. {
  279. "matcher": {
  280. "id": "byName",
  281. "options": "断开连接"
  282. },
  283. "properties": [
  284. {
  285. "id": "color",
  286. "value": {
  287. "fixedColor": "yellow",
  288. "mode": "fixed"
  289. }
  290. }
  291. ]
  292. }
  293. ]
  294. },
  295. "gridPos": {
  296. "h": 9,
  297. "w": 12,
  298. "x": 0,
  299. "y": 9
  300. },
  301. "id": 22,
  302. "options": {
  303. "displayLabels": [],
  304. "legend": {
  305. "displayMode": "list",
  306. "placement": "right",
  307. "showLegend": true,
  308. "values": []
  309. },
  310. "pieType": "pie",
  311. "reduceOptions": {
  312. "calcs": [
  313. "last"
  314. ],
  315. "fields": "",
  316. "values": false
  317. },
  318. "tooltip": {
  319. "mode": "single",
  320. "sort": "none"
  321. }
  322. },
  323. "pluginVersion": "9.4.7",
  324. "targets": [
  325. {
  326. "datasource": {
  327. "type": "prometheus",
  328. "uid": "6JQZS4L4z"
  329. },
  330. "editorMode": "builder",
  331. "exemplar": false,
  332. "expr": "count(cluster == 0)",
  333. "instant": true,
  334. "legendFormat": "异常",
  335. "range": false,
  336. "refId": "A"
  337. },
  338. {
  339. "datasource": {
  340. "type": "prometheus",
  341. "uid": "6JQZS4L4z"
  342. },
  343. "editorMode": "builder",
  344. "exemplar": false,
  345. "expr": "count(cluster == 1)",
  346. "hide": false,
  347. "instant": true,
  348. "legendFormat": "等待接入",
  349. "range": false,
  350. "refId": "B"
  351. },
  352. {
  353. "datasource": {
  354. "type": "prometheus",
  355. "uid": "6JQZS4L4z"
  356. },
  357. "editorMode": "builder",
  358. "exemplar": false,
  359. "expr": "count(cluster == 2)",
  360. "hide": false,
  361. "instant": true,
  362. "legendFormat": "断开连接",
  363. "range": false,
  364. "refId": "C"
  365. },
  366. {
  367. "datasource": {
  368. "type": "prometheus",
  369. "uid": "6JQZS4L4z"
  370. },
  371. "editorMode": "builder",
  372. "exemplar": false,
  373. "expr": "count(cluster == 3)",
  374. "hide": false,
  375. "instant": true,
  376. "legendFormat": "正常",
  377. "range": false,
  378. "refId": "D"
  379. }
  380. ],
  381. "title": "集群状态",
  382. "type": "piechart"
  383. },
  384. {
  385. "datasource": {
  386. "type": "prometheus",
  387. "uid": "6JQZS4L4z"
  388. },
  389. "fieldConfig": {
  390. "defaults": {
  391. "color": {
  392. "mode": "thresholds"
  393. },
  394. "custom": {
  395. "fillOpacity": 70,
  396. "lineWidth": 1
  397. },
  398. "decimals": 0,
  399. "mappings": [
  400. {
  401. "options": {
  402. "0": {
  403. "color": "red",
  404. "index": 3,
  405. "text": "异常"
  406. },
  407. "1": {
  408. "color": "blue",
  409. "index": 2,
  410. "text": "等待接入"
  411. },
  412. "2": {
  413. "color": "yellow",
  414. "index": 1,
  415. "text": "断开连接"
  416. },
  417. "3": {
  418. "color": "green",
  419. "index": 0,
  420. "text": "正常"
  421. }
  422. },
  423. "type": "value"
  424. }
  425. ],
  426. "max": 3,
  427. "min": 0,
  428. "thresholds": {
  429. "mode": "absolute",
  430. "steps": [
  431. {
  432. "color": "green",
  433. "value": null
  434. }
  435. ]
  436. }
  437. },
  438. "overrides": []
  439. },
  440. "gridPos": {
  441. "h": 9,
  442. "w": 12,
  443. "x": 12,
  444. "y": 9
  445. },
  446. "id": 24,
  447. "options": {
  448. "colWidth": 0.9,
  449. "legend": {
  450. "displayMode": "list",
  451. "placement": "right",
  452. "showLegend": true
  453. },
  454. "rowHeight": 0.9,
  455. "showValue": "never",
  456. "tooltip": {
  457. "mode": "single",
  458. "sort": "none"
  459. }
  460. },
  461. "pluginVersion": "9.4.7",
  462. "targets": [
  463. {
  464. "datasource": {
  465. "type": "prometheus",
  466. "uid": "6JQZS4L4z"
  467. },
  468. "editorMode": "builder",
  469. "exemplar": false,
  470. "expr": "cluster",
  471. "instant": false,
  472. "interval": "",
  473. "legendFormat": "{{cluster}}",
  474. "range": true,
  475. "refId": "A"
  476. }
  477. ],
  478. "title": "集群状态详情",
  479. "type": "status-history"
  480. },
  481. {
  482. "datasource": {
  483. "type": "prometheus",
  484. "uid": "6JQZS4L4z"
  485. },
  486. "fieldConfig": {
  487. "defaults": {
  488. "color": {
  489. "mode": "palette-classic"
  490. },
  491. "custom": {
  492. "axisCenteredZero": false,
  493. "axisColorMode": "text",
  494. "axisLabel": "",
  495. "axisPlacement": "auto",
  496. "barAlignment": 0,
  497. "drawStyle": "line",
  498. "fillOpacity": 0,
  499. "gradientMode": "none",
  500. "hideFrom": {
  501. "legend": false,
  502. "tooltip": false,
  503. "viz": false
  504. },
  505. "lineInterpolation": "linear",
  506. "lineWidth": 1,
  507. "pointSize": 5,
  508. "scaleDistribution": {
  509. "type": "linear"
  510. },
  511. "showPoints": "auto",
  512. "spanNulls": false,
  513. "stacking": {
  514. "group": "A",
  515. "mode": "none"
  516. },
  517. "thresholdsStyle": {
  518. "mode": "off"
  519. }
  520. },
  521. "mappings": [],
  522. "thresholds": {
  523. "mode": "percentage",
  524. "steps": [
  525. {
  526. "color": "green",
  527. "value": null
  528. },
  529. {
  530. "color": "red",
  531. "value": 80
  532. }
  533. ]
  534. },
  535. "unit": "percentunit"
  536. },
  537. "overrides": []
  538. },
  539. "gridPos": {
  540. "h": 8,
  541. "w": 12,
  542. "x": 0,
  543. "y": 18
  544. },
  545. "id": 16,
  546. "options": {
  547. "legend": {
  548. "calcs": [],
  549. "displayMode": "list",
  550. "placement": "right",
  551. "showLegend": true
  552. },
  553. "tooltip": {
  554. "mode": "single",
  555. "sort": "none"
  556. }
  557. },
  558. "targets": [
  559. {
  560. "datasource": {
  561. "type": "prometheus",
  562. "uid": "6JQZS4L4z"
  563. },
  564. "editorMode": "builder",
  565. "expr": "cpu_percentage",
  566. "legendFormat": "{{service}}",
  567. "range": true,
  568. "refId": "A"
  569. }
  570. ],
  571. "title": "CPU占用百分比",
  572. "type": "timeseries"
  573. },
  574. {
  575. "datasource": {
  576. "type": "prometheus",
  577. "uid": "6JQZS4L4z"
  578. },
  579. "fieldConfig": {
  580. "defaults": {
  581. "color": {
  582. "mode": "palette-classic"
  583. },
  584. "custom": {
  585. "axisCenteredZero": false,
  586. "axisColorMode": "text",
  587. "axisLabel": "",
  588. "axisPlacement": "auto",
  589. "barAlignment": 0,
  590. "drawStyle": "line",
  591. "fillOpacity": 0,
  592. "gradientMode": "none",
  593. "hideFrom": {
  594. "legend": false,
  595. "tooltip": false,
  596. "viz": false
  597. },
  598. "lineInterpolation": "linear",
  599. "lineWidth": 1,
  600. "pointSize": 5,
  601. "scaleDistribution": {
  602. "type": "linear"
  603. },
  604. "showPoints": "auto",
  605. "spanNulls": false,
  606. "stacking": {
  607. "group": "A",
  608. "mode": "none"
  609. },
  610. "thresholdsStyle": {
  611. "mode": "off"
  612. }
  613. },
  614. "mappings": [],
  615. "thresholds": {
  616. "mode": "percentage",
  617. "steps": [
  618. {
  619. "color": "green",
  620. "value": null
  621. },
  622. {
  623. "color": "red",
  624. "value": 80
  625. }
  626. ]
  627. },
  628. "unit": "percentunit"
  629. },
  630. "overrides": []
  631. },
  632. "gridPos": {
  633. "h": 8,
  634. "w": 12,
  635. "x": 12,
  636. "y": 18
  637. },
  638. "id": 18,
  639. "options": {
  640. "legend": {
  641. "calcs": [],
  642. "displayMode": "list",
  643. "placement": "right",
  644. "showLegend": true
  645. },
  646. "tooltip": {
  647. "mode": "single",
  648. "sort": "none"
  649. }
  650. },
  651. "targets": [
  652. {
  653. "datasource": {
  654. "type": "prometheus",
  655. "uid": "6JQZS4L4z"
  656. },
  657. "editorMode": "builder",
  658. "expr": "memory_percentage",
  659. "legendFormat": "{{service}}",
  660. "range": true,
  661. "refId": "A"
  662. }
  663. ],
  664. "title": "内存占用百分比",
  665. "type": "timeseries"
  666. },
  667. {
  668. "datasource": {
  669. "type": "prometheus",
  670. "uid": "6JQZS4L4z"
  671. },
  672. "fieldConfig": {
  673. "defaults": {
  674. "color": {
  675. "mode": "palette-classic"
  676. },
  677. "custom": {
  678. "axisCenteredZero": false,
  679. "axisColorMode": "text",
  680. "axisLabel": "",
  681. "axisPlacement": "auto",
  682. "barAlignment": 0,
  683. "drawStyle": "line",
  684. "fillOpacity": 0,
  685. "gradientMode": "none",
  686. "hideFrom": {
  687. "legend": false,
  688. "tooltip": false,
  689. "viz": false
  690. },
  691. "lineInterpolation": "linear",
  692. "lineWidth": 1,
  693. "pointSize": 5,
  694. "scaleDistribution": {
  695. "type": "linear"
  696. },
  697. "showPoints": "auto",
  698. "spanNulls": false,
  699. "stacking": {
  700. "group": "A",
  701. "mode": "none"
  702. },
  703. "thresholdsStyle": {
  704. "mode": "off"
  705. }
  706. },
  707. "mappings": [],
  708. "thresholds": {
  709. "mode": "absolute",
  710. "steps": [
  711. {
  712. "color": "green",
  713. "value": null
  714. }
  715. ]
  716. }
  717. },
  718. "overrides": []
  719. },
  720. "gridPos": {
  721. "h": 8,
  722. "w": 12,
  723. "x": 0,
  724. "y": 26
  725. },
  726. "id": 8,
  727. "options": {
  728. "legend": {
  729. "calcs": [],
  730. "displayMode": "list",
  731. "placement": "right",
  732. "showLegend": true
  733. },
  734. "tooltip": {
  735. "mode": "single",
  736. "sort": "none"
  737. }
  738. },
  739. "targets": [
  740. {
  741. "editorMode": "code",
  742. "expr": "cpu",
  743. "legendFormat": "{{service}}",
  744. "range": true,
  745. "refId": "A"
  746. }
  747. ],
  748. "title": "CPU 消耗",
  749. "type": "timeseries"
  750. },
  751. {
  752. "datasource": {
  753. "type": "prometheus",
  754. "uid": "6JQZS4L4z"
  755. },
  756. "fieldConfig": {
  757. "defaults": {
  758. "color": {
  759. "mode": "palette-classic"
  760. },
  761. "custom": {
  762. "axisCenteredZero": false,
  763. "axisColorMode": "text",
  764. "axisLabel": "",
  765. "axisPlacement": "auto",
  766. "barAlignment": 0,
  767. "drawStyle": "line",
  768. "fillOpacity": 0,
  769. "gradientMode": "none",
  770. "hideFrom": {
  771. "legend": false,
  772. "tooltip": false,
  773. "viz": false
  774. },
  775. "lineInterpolation": "linear",
  776. "lineWidth": 1,
  777. "pointSize": 5,
  778. "scaleDistribution": {
  779. "type": "linear"
  780. },
  781. "showPoints": "auto",
  782. "spanNulls": false,
  783. "stacking": {
  784. "group": "A",
  785. "mode": "none"
  786. },
  787. "thresholdsStyle": {
  788. "mode": "off"
  789. }
  790. },
  791. "mappings": [],
  792. "thresholds": {
  793. "mode": "absolute",
  794. "steps": [
  795. {
  796. "color": "green",
  797. "value": null
  798. }
  799. ]
  800. }
  801. },
  802. "overrides": []
  803. },
  804. "gridPos": {
  805. "h": 8,
  806. "w": 12,
  807. "x": 12,
  808. "y": 26
  809. },
  810. "id": 10,
  811. "options": {
  812. "legend": {
  813. "calcs": [],
  814. "displayMode": "list",
  815. "placement": "right",
  816. "showLegend": true
  817. },
  818. "tooltip": {
  819. "mode": "single",
  820. "sort": "none"
  821. }
  822. },
  823. "targets": [
  824. {
  825. "editorMode": "code",
  826. "expr": "memory",
  827. "legendFormat": "{{service}}",
  828. "range": true,
  829. "refId": "A"
  830. }
  831. ],
  832. "title": "内存消耗(MB)",
  833. "type": "timeseries"
  834. },
  835. {
  836. "fieldConfig": {
  837. "defaults": {
  838. "color": {
  839. "mode": "palette-classic"
  840. },
  841. "custom": {
  842. "axisCenteredZero": false,
  843. "axisColorMode": "text",
  844. "axisLabel": "",
  845. "axisPlacement": "auto",
  846. "barAlignment": 0,
  847. "drawStyle": "line",
  848. "fillOpacity": 0,
  849. "gradientMode": "none",
  850. "hideFrom": {
  851. "legend": false,
  852. "tooltip": false,
  853. "viz": false
  854. },
  855. "lineInterpolation": "linear",
  856. "lineWidth": 3,
  857. "pointSize": 5,
  858. "scaleDistribution": {
  859. "type": "linear"
  860. },
  861. "showPoints": "auto",
  862. "spanNulls": false,
  863. "stacking": {
  864. "group": "A",
  865. "mode": "none"
  866. },
  867. "thresholdsStyle": {
  868. "mode": "off"
  869. }
  870. },
  871. "mappings": [],
  872. "thresholds": {
  873. "mode": "absolute",
  874. "steps": [
  875. {
  876. "color": "green",
  877. "value": null
  878. },
  879. {
  880. "color": "red",
  881. "value": 80
  882. }
  883. ]
  884. }
  885. },
  886. "overrides": []
  887. },
  888. "gridPos": {
  889. "h": 9,
  890. "w": 12,
  891. "x": 0,
  892. "y": 34
  893. },
  894. "id": 2,
  895. "options": {
  896. "legend": {
  897. "calcs": [],
  898. "displayMode": "list",
  899. "placement": "right",
  900. "showLegend": true
  901. },
  902. "tooltip": {
  903. "mode": "single",
  904. "sort": "none"
  905. }
  906. },
  907. "targets": [
  908. {
  909. "editorMode": "code",
  910. "exemplar": false,
  911. "expr": "sum by(instance) (rate(request_total{status=\"200\"}[1m]))",
  912. "format": "time_series",
  913. "hide": false,
  914. "instant": false,
  915. "interval": "",
  916. "legendFormat": "200",
  917. "range": true,
  918. "refId": "A"
  919. },
  920. {
  921. "editorMode": "code",
  922. "expr": "sum by(instance) (rate(request_total{status=~\"4..\"}[1m]))",
  923. "hide": false,
  924. "legendFormat": "4xx",
  925. "range": true,
  926. "refId": "B"
  927. },
  928. {
  929. "editorMode": "code",
  930. "expr": "sum by(instance) (rate(request_total{status=~\"5..\"}[1m]))",
  931. "hide": false,
  932. "legendFormat": "5xx",
  933. "range": true,
  934. "refId": "C"
  935. }
  936. ],
  937. "title": "QPS追踪",
  938. "type": "timeseries"
  939. },
  940. {
  941. "fieldConfig": {
  942. "defaults": {
  943. "color": {
  944. "mode": "thresholds"
  945. },
  946. "custom": {
  947. "align": "auto",
  948. "cellOptions": {
  949. "type": "auto"
  950. },
  951. "inspect": false
  952. },
  953. "mappings": [],
  954. "thresholds": {
  955. "mode": "absolute",
  956. "steps": [
  957. {
  958. "color": "green",
  959. "value": null
  960. },
  961. {
  962. "color": "red",
  963. "value": 80
  964. }
  965. ]
  966. }
  967. },
  968. "overrides": [
  969. {
  970. "matcher": {
  971. "id": "byName",
  972. "options": "慢接口"
  973. },
  974. "properties": [
  975. {
  976. "id": "custom.width",
  977. "value": 1301
  978. }
  979. ]
  980. }
  981. ]
  982. },
  983. "gridPos": {
  984. "h": 9,
  985. "w": 12,
  986. "x": 12,
  987. "y": 34
  988. },
  989. "id": 12,
  990. "options": {
  991. "footer": {
  992. "countRows": false,
  993. "fields": "",
  994. "reducer": [
  995. "sum"
  996. ],
  997. "show": false
  998. },
  999. "frameIndex": 0,
  1000. "showHeader": true,
  1001. "sortBy": []
  1002. },
  1003. "pluginVersion": "9.4.7",
  1004. "targets": [
  1005. {
  1006. "editorMode": "code",
  1007. "exemplar": false,
  1008. "expr": "topk(10, sum by(method, handler) (api_response_time_bucket{status=\"200\",le=\"+Inf\"}) - sum by(method, handler) (api_response_time_bucket{status=\"200\",le=\"1.4\"}))",
  1009. "format": "time_series",
  1010. "hide": false,
  1011. "legendFormat": "__auto",
  1012. "range": true,
  1013. "refId": "A"
  1014. }
  1015. ],
  1016. "title": "慢接口列表",
  1017. "transformations": [
  1018. {
  1019. "id": "reduce",
  1020. "options": {
  1021. "includeTimeField": false,
  1022. "labelsToFields": false,
  1023. "mode": "seriesToRows",
  1024. "reducers": [
  1025. "last"
  1026. ]
  1027. }
  1028. },
  1029. {
  1030. "id": "sortBy",
  1031. "options": {
  1032. "fields": {},
  1033. "sort": [
  1034. {
  1035. "desc": true,
  1036. "field": "Last"
  1037. }
  1038. ]
  1039. }
  1040. },
  1041. {
  1042. "id": "organize",
  1043. "options": {
  1044. "excludeByName": {},
  1045. "indexByName": {},
  1046. "renameByName": {
  1047. "Field": "慢接口",
  1048. "Last": "请求数量"
  1049. }
  1050. }
  1051. }
  1052. ],
  1053. "type": "table"
  1054. }
  1055. ],
  1056. "refresh": "5s",
  1057. "revision": 1,
  1058. "schemaVersion": 38,
  1059. "style": "dark",
  1060. "tags": [],
  1061. "templating": {
  1062. "list": []
  1063. },
  1064. "time": {
  1065. "from": "now-5m",
  1066. "to": "now"
  1067. },
  1068. "timepicker": {},
  1069. "timezone": "",
  1070. "title": "Zadig 监控面板",
  1071. "uid": "w1U8k1xVkfew",
  1072. "version": 26,
  1073. "weekStart": ""
  1074. }
  3. 在面板列表中找到 Zadig 监控面板,确认数据正常展示。

告警规则配置

  1. 导入告警规则,导入方法可参考 Grafana 文档
  1. apiVersion: 1
  2. groups:
  3. - orgId: 1
  4. name: zadig
  5. folder: koderover
  6. interval: 10s
  7. rules:
  8. - uid: 4ugWsnEIk
  9. title: CPU占用百分比
  10. condition: C
  11. data:
  12. - refId: A
  13. relativeTimeRange:
  14. from: 300
  15. to: 0
  16. datasourceUid: 6JQZS4L4z
  17. model:
  18. datasource:
  19. type: prometheus
  20. uid: 6JQZS4L4z
  21. editorMode: builder
  22. expr: cpu_percentage{service=~"aslan|user|cron|dind|plutus-vendor|hub-server"}
  23. interval: ""
  24. intervalMs: 15000
  25. legendFormat: "{{service}}"
  26. maxDataPoints: 43200
  27. range: true
  28. refId: A
  29. - refId: B
  30. relativeTimeRange:
  31. from: 300
  32. to: 0
  33. datasourceUid: __expr__
  34. model:
  35. conditions:
  36. - evaluator:
  37. params: []
  38. type: gt
  39. operator:
  40. type: and
  41. query:
  42. params:
  43. - B
  44. reducer:
  45. params: []
  46. type: last
  47. type: query
  48. datasource:
  49. type: __expr__
  50. uid: __expr__
  51. expression: A
  52. hide: false
  53. intervalMs: 1000
  54. maxDataPoints: 43200
  55. reducer: mean
  56. refId: B
  57. type: reduce
  58. - refId: C
  59. relativeTimeRange:
  60. from: 300
  61. to: 0
  62. datasourceUid: __expr__
  63. model:
  64. conditions:
  65. - evaluator:
  66. params:
  67. - 0.8
  68. type: gt
  69. operator:
  70. type: and
  71. query:
  72. params:
  73. - C
  74. reducer:
  75. params: []
  76. type: last
  77. type: query
  78. datasource:
  79. type: __expr__
  80. uid: __expr__
  81. expression: B
  82. hide: false
  83. intervalMs: 1000
  84. maxDataPoints: 43200
  85. refId: C
  86. type: threshold
  87. dashboardUid: w1U8k1xVkfew
  88. panelId: 16
  89. noDataState: NoData
  90. execErrState: Error
  91. for: 5m
  92. annotations:
  93. __dashboardUid__: w1U8k1xVkfew
  94. __panelId__: "16"
  95. isPaused: false
  96. - uid: YIcGs7ESz
  97. title: 内存占用百分比
  98. condition: C
  99. data:
  100. - refId: A
  101. relativeTimeRange:
  102. from: 300
  103. to: 0
  104. datasourceUid: 6JQZS4L4z
  105. model:
  106. datasource:
  107. type: prometheus
  108. uid: 6JQZS4L4z
  109. editorMode: builder
  110. expr: memory_percentage{service=~"aslan|user|cron|hub-server|plutus-vendor|dind"}
  111. interval: ""
  112. intervalMs: 15000
  113. legendFormat: "{{service}}"
  114. maxDataPoints: 43200
  115. range: true
  116. refId: A
  117. - refId: B
  118. relativeTimeRange:
  119. from: 300
  120. to: 0
  121. datasourceUid: __expr__
  122. model:
  123. conditions:
  124. - evaluator:
  125. params: []
  126. type: gt
  127. operator:
  128. type: and
  129. query:
  130. params:
  131. - B
  132. reducer:
  133. params: []
  134. type: last
  135. type: query
  136. datasource:
  137. type: __expr__
  138. uid: __expr__
  139. expression: A
  140. hide: false
  141. intervalMs: 1000
  142. maxDataPoints: 43200
  143. reducer: mean
  144. refId: B
  145. type: reduce
  146. - refId: C
  147. relativeTimeRange:
  148. from: 300
  149. to: 0
  150. datasourceUid: __expr__
  151. model:
  152. conditions:
  153. - evaluator:
  154. params:
  155. - 0.8
  156. type: gt
  157. operator:
  158. type: and
  159. query:
  160. params:
  161. - C
  162. reducer:
  163. params: []
  164. type: last
  165. type: query
  166. datasource:
  167. type: __expr__
  168. uid: __expr__
  169. expression: B
  170. hide: false
  171. intervalMs: 1000
  172. maxDataPoints: 43200
  173. refId: C
  174. type: threshold
  175. dashboardUid: w1U8k1xVkfew
  176. panelId: 18
  177. noDataState: NoData
  178. execErrState: Error
  179. for: 5m
  180. annotations:
  181. __dashboardUid__: w1U8k1xVkfew
  182. __panelId__: "18"
  183. isPaused: false
  184. - uid: RCovy7PIk
  185. title: 服务健康状态
  186. condition: C
  187. data:
  188. - refId: A
  189. relativeTimeRange:
  190. from: 10
  191. to: 0
  192. datasourceUid: 6JQZS4L4z
  193. model:
  194. editorMode: builder
  195. expr: healthy{service=~"aslan|dind|plutus-vendor|user|zadig-portal|hub-server"}
  196. hide: false
  197. intervalMs: 1000
  198. legendFormat: __auto
  199. maxDataPoints: 43200
  200. range: true
  201. refId: A
  202. - refId: B
  203. relativeTimeRange:
  204. from: 10
  205. to: 0
  206. datasourceUid: __expr__
  207. model:
  208. conditions:
  209. - evaluator:
  210. params: []
  211. type: gt
  212. operator:
  213. type: and
  214. query:
  215. params:
  216. - B
  217. reducer:
  218. params: []
  219. type: last
  220. type: query
  221. datasource:
  222. type: __expr__
  223. uid: __expr__
  224. expression: A
  225. hide: false
  226. intervalMs: 1000
  227. maxDataPoints: 43200
  228. reducer: last
  229. refId: B
  230. type: reduce
  231. - refId: C
  232. relativeTimeRange:
  233. from: 10
  234. to: 0
  235. datasourceUid: __expr__
  236. model:
  237. conditions:
  238. - evaluator:
  239. params:
  240. - 1
  241. type: lt
  242. operator:
  243. type: and
  244. query:
  245. params:
  246. - C
  247. reducer:
  248. params: []
  249. type: last
  250. type: query
  251. datasource:
  252. type: __expr__
  253. uid: __expr__
  254. expression: B
  255. hide: false
  256. intervalMs: 1000
  257. maxDataPoints: 43200
  258. refId: C
  259. type: threshold
  260. noDataState: NoData
  261. execErrState: Error
  262. for: 10s
  263. isPaused: false
  264. - uid: h6yZC8yIz
  265. title: 状态码5xx告警
  266. condition: C
  267. data:
  268. - refId: A
  269. relativeTimeRange:
  270. from: 300
  271. to: 0
  272. datasourceUid: 6JQZS4L4z
  273. model:
  274. editorMode: builder
  275. expr: sum(api_response_time_bucket{status=~"5..", le="300"})
  276. hide: false
  277. intervalMs: 1000
  278. legendFormat: __auto
  279. maxDataPoints: 43200
  280. range: true
  281. refId: A
  282. - refId: B
  283. relativeTimeRange:
  284. from: 300
  285. to: 0
  286. datasourceUid: __expr__
  287. model:
  288. conditions:
  289. - evaluator:
  290. params:
  291. - 0
  292. - 0
  293. type: gt
  294. operator:
  295. type: and
  296. query:
  297. params: []
  298. reducer:
  299. params: []
  300. type: avg
  301. type: query
  302. datasource:
  303. name: Expression
  304. type: __expr__
  305. uid: __expr__
  306. expression: A
  307. intervalMs: 1000
  308. maxDataPoints: 43200
  309. reducer: mean
  310. refId: B
  311. settings:
  312. mode: ""
  313. type: reduce
  314. - refId: C
  315. relativeTimeRange:
  316. from: 300
  317. to: 0
  318. datasourceUid: __expr__
  319. model:
  320. conditions:
  321. - evaluator:
  322. params:
  323. - 10
  324. - 0
  325. type: gt
  326. operator:
  327. type: and
  328. query:
  329. params: []
  330. reducer:
  331. params: []
  332. type: avg
  333. type: query
  334. datasource:
  335. name: Expression
  336. type: __expr__
  337. uid: __expr__
  338. expression: B
  339. intervalMs: 1000
  340. maxDataPoints: 43200
  341. refId: C
  342. type: threshold
  343. noDataState: NoData
  344. execErrState: Error
  345. for: 5m
  346. isPaused: false
  347. - uid: fA_BZusIz
  348. title: 请求返回时间P95
  349. condition: C
  350. data:
  351. - refId: A
  352. relativeTimeRange:
  353. from: 600
  354. to: 0
  355. datasourceUid: 6JQZS4L4z
  356. model:
  357. editorMode: builder
  358. expr: histogram_quantile(0.95, sum by(le) (increase(api_response_time_bucket[5m])))
  359. hide: false
  360. intervalMs: 1000
  361. legendFormat: __auto
  362. maxDataPoints: 43200
  363. range: true
  364. refId: A
  365. - refId: B
  366. datasourceUid: __expr__
  367. model:
  368. conditions:
  369. - evaluator:
  370. params:
  371. - 0
  372. - 0
  373. type: gt
  374. operator:
  375. type: and
  376. query:
  377. params: []
  378. reducer:
  379. params: []
  380. type: avg
  381. type: query
  382. datasource:
  383. name: Expression
  384. type: __expr__
  385. uid: __expr__
  386. expression: A
  387. intervalMs: 1000
  388. maxDataPoints: 43200
  389. reducer: mean
  390. refId: B
  391. type: reduce
  392. - refId: C
  393. datasourceUid: __expr__
  394. model:
  395. conditions:
  396. - evaluator:
  397. params:
  398. - 2
  399. - 0
  400. type: gt
  401. operator:
  402. type: and
  403. query:
  404. params: []
  405. reducer:
  406. params: []
  407. type: avg
  408. type: query
  409. datasource:
  410. name: Expression
  411. type: __expr__
  412. uid: __expr__
  413. expression: B
  414. intervalMs: 1000
  415. maxDataPoints: 43200
  416. refId: C
  417. type: threshold
  418. noDataState: NoData
  419. execErrState: Error
  420. for: 5m
  421. isPaused: false
  2. 在告警规则列表中找到 Zadig 的告警规则,确认数据正常。