监控Linux操作系统

部署Telegraf之后,即可采集到常见的监控指标。Telegraf的具体使用方法在前面的章节已有介绍,这里不再赘述;本节主要提供常见的大盘配置和告警规则配置的JSON,便于大家快速上手。

Linux操作系统监控大盘

  1. [
  2. {
  3. "name": "Linux基本监控指标-Telegraf采集",
  4. "tags": "HOST",
  5. "configs": "{\"var\":[{\"name\":\"host\",\"definition\":\"label_values(mem_used_percent, ident)\"}]}",
  6. "chart_groups": [
  7. {
  8. "name": "Default chart group",
  9. "weight": 0,
  10. "charts": [
  11. {
  12. "configs": "{\"name\":\"整机CPU空闲率(%)\",\"QL\":[{\"PromQL\":\"cpu_usage_idle{cpu=\\\"cpu-total\\\", ident=\\\"$host\\\"}\"}],\"yplotline1\":35,\"yplotline2\":15,\"legend\":false,\"highLevelConfig\":{\"shared\":true,\"sharedSortDirection\":\"asc\",\"precision\":\"origin\",\"formatUnit\":1000},\"version\":1,\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":0,\"i\":\"0\"}}",
  13. "weight": 0
  14. },
  15. {
  16. "configs": "{\"name\":\"内存可用率(%)\",\"QL\":[{\"PromQL\":\"mem_available_percent{ident=\\\"$host\\\"}\"}],\"yplotline1\":30,\"yplotline2\":15,\"legend\":false,\"highLevelConfig\":{\"shared\":true,\"sharedSortDirection\":\"asc\",\"precision\":\"origin\",\"formatUnit\":1000},\"version\":1,\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":0,\"i\":\"1\"}}",
  17. "weight": 0
  18. },
  19. {
  20. "configs": "{\"name\":\"硬盘利用率(%)\",\"QL\":[{\"PromQL\":\"disk_used_percent{ident=\\\"$host\\\"}\"}],\"yplotline1\":87,\"yplotline2\":92,\"legend\":false,\"highLevelConfig\":{\"shared\":true,\"sharedSortDirection\":\"desc\",\"precision\":\"origin\",\"formatUnit\":1000},\"version\":1,\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":0,\"i\":\"2\"}}",
  21. "weight": 0
  22. },
  23. {
  24. "configs": "{\"name\":\"IO.UTIL(%)\",\"QL\":[{\"PromQL\":\"rate(diskio_io_time{ident=\\\"$host\\\"}[1m])/10\"}],\"yplotline1\":90,\"yplotline2\":null,\"legend\":false,\"highLevelConfig\":{\"shared\":true,\"sharedSortDirection\":\"desc\",\"precision\":\"origin\",\"formatUnit\":1000},\"version\":1,\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":2,\"i\":\"3\"}}",
  25. "weight": 0
  26. },
  27. {
  28. "configs": "{\"name\":\"网卡每分钟丢包数(个)\",\"QL\":[{\"PromQL\":\"increase(net_drop_in{ident=\\\"$host\\\"}[1m])\",\"Legend\":\"net_drop_in ident:{{ident}} interface:{{interface}}\"},{\"PromQL\":\"increase(net_drop_out{ident=\\\"$host\\\"}[1m])\",\"Legend\":\"net_drop_out ident:{{ident}} interface:{{interface}}\"}],\"yplotline1\":5,\"yplotline2\":20,\"legend\":false,\"highLevelConfig\":{\"shared\":true,\"sharedSortDirection\":\"desc\",\"precision\":\"short\",\"formatUnit\":1000},\"version\":1,\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":2,\"i\":\"4\"}}",
  29. "weight": 0
  30. },
  31. {
  32. "configs": "{\"name\":\"TCP_TIME_WAIT数量\",\"QL\":[{\"PromQL\":\"netstat_tcp_time_wait{ident=\\\"$host\\\"}\"}],\"yplotline1\":null,\"yplotline2\":20000,\"legend\":false,\"highLevelConfig\":{\"shared\":true,\"sharedSortDirection\":\"desc\",\"precision\":\"short\",\"formatUnit\":1000},\"version\":1,\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":2,\"i\":\"5\"}}",
  33. "weight": 0
  34. }
  35. ]
  36. }
  37. ]
  38. }
  39. ]

Linux操作系统常用告警规则

  1. [
  2. {
  3. "name": "有地址PING不通,请注意",
  4. "note": "",
  5. "severity": 1,
  6. "disabled": 0,
  7. "prom_for_duration": 60,
  8. "prom_ql": "ping_result_code != 0",
  9. "prom_eval_interval": 15,
  10. "enable_stime": "00:00",
  11. "enable_etime": "23:59",
  12. "enable_days_of_week": [
  13. "1",
  14. "2",
  15. "3",
  16. "4",
  17. "5",
  18. "6",
  19. "0"
  20. ],
  21. "notify_recovered": 1,
  22. "notify_channels": [
  23. "email",
  24. "dingtalk",
  25. "wecom"
  26. ],
  27. "notify_repeat_step": 60,
  28. "callbacks": [],
  29. "runbook_url": "",
  30. "append_tags": []
  31. },
  32. {
  33. "name": "有监控对象失联",
  34. "note": "",
  35. "severity": 1,
  36. "disabled": 0,
  37. "prom_for_duration": 60,
  38. "prom_ql": "target_up != 1",
  39. "prom_eval_interval": 15,
  40. "enable_stime": "00:00",
  41. "enable_etime": "23:59",
  42. "enable_days_of_week": [
  43. "1",
  44. "2",
  45. "3",
  46. "4",
  47. "5",
  48. "6",
  49. "0"
  50. ],
  51. "notify_recovered": 1,
  52. "notify_channels": [
  53. "email",
  54. "dingtalk",
  55. "wecom"
  56. ],
  57. "notify_repeat_step": 60,
  58. "callbacks": [],
  59. "runbook_url": "",
  60. "append_tags": []
  61. },
  62. {
  63. "name": "有端口探测失败,请注意",
  64. "note": "",
  65. "severity": 1,
  66. "disabled": 0,
  67. "prom_for_duration": 60,
  68. "prom_ql": "net_response_result_code != 0",
  69. "prom_eval_interval": 15,
  70. "enable_stime": "00:00",
  71. "enable_etime": "23:59",
  72. "enable_days_of_week": [
  73. "1",
  74. "2",
  75. "3",
  76. "4",
  77. "5",
  78. "6",
  79. "0"
  80. ],
  81. "notify_recovered": 1,
  82. "notify_channels": [
  83. "email",
  84. "dingtalk",
  85. "wecom"
  86. ],
  87. "notify_repeat_step": 60,
  88. "callbacks": [],
  89. "runbook_url": "",
  90. "append_tags": []
  91. },
  92. {
  93. "name": "机器负载-CPU较高,请关注",
  94. "note": "",
  95. "severity": 3,
  96. "disabled": 0,
  97. "prom_for_duration": 60,
  98. "prom_ql": "cpu_usage_idle{cpu=\"cpu-total\"} < 25",
  99. "prom_eval_interval": 15,
  100. "enable_stime": "00:00",
  101. "enable_etime": "23:59",
  102. "enable_days_of_week": [
  103. "1",
  104. "2",
  105. "3",
  106. "4",
  107. "5",
  108. "6",
  109. "0"
  110. ],
  111. "notify_recovered": 1,
  112. "notify_channels": [
  113. "email",
  114. "dingtalk",
  115. "wecom"
  116. ],
  117. "notify_repeat_step": 60,
  118. "callbacks": [],
  119. "runbook_url": "",
  120. "append_tags": []
  121. },
  122. {
  123. "name": "机器负载-内存较高,请关注",
  124. "note": "",
  125. "severity": 2,
  126. "disabled": 0,
  127. "prom_for_duration": 60,
  128. "prom_ql": "mem_available_percent < 25",
  129. "prom_eval_interval": 15,
  130. "enable_stime": "00:00",
  131. "enable_etime": "23:59",
  132. "enable_days_of_week": [
  133. "1",
  134. "2",
  135. "3",
  136. "4",
  137. "5",
  138. "6",
  139. "0"
  140. ],
  141. "notify_recovered": 1,
  142. "notify_channels": [
  143. "email",
  144. "dingtalk",
  145. "wecom"
  146. ],
  147. "notify_repeat_step": 60,
  148. "callbacks": [],
  149. "runbook_url": "",
  150. "append_tags": []
  151. },
  152. {
  153. "name": "硬盘-IO非常繁忙",
  154. "note": "",
  155. "severity": 2,
  156. "disabled": 0,
  157. "prom_for_duration": 60,
  158. "prom_ql": "rate(diskio_io_time[1m])/10 > 99",
  159. "prom_eval_interval": 15,
  160. "enable_stime": "00:00",
  161. "enable_etime": "23:59",
  162. "enable_days_of_week": [
  163. "1",
  164. "2",
  165. "3",
  166. "4",
  167. "5",
  168. "6",
  169. "0"
  170. ],
  171. "notify_recovered": 1,
  172. "notify_channels": [
  173. "email",
  174. "dingtalk",
  175. "wecom"
  176. ],
  177. "notify_repeat_step": 60,
  178. "callbacks": [],
  179. "runbook_url": "",
  180. "append_tags": []
  181. },
  182. {
  183. "name": "硬盘-预计再有4小时写满",
  184. "note": "",
  185. "severity": 1,
  186. "disabled": 0,
  187. "prom_for_duration": 60,
  188. "prom_ql": "predict_linear(disk_free[1h], 4*3600) < 0",
  189. "prom_eval_interval": 15,
  190. "enable_stime": "00:00",
  191. "enable_etime": "23:59",
  192. "enable_days_of_week": [
  193. "1",
  194. "2",
  195. "3",
  196. "4",
  197. "5",
  198. "6",
  199. "0"
  200. ],
  201. "notify_recovered": 1,
  202. "notify_channels": [
  203. "email",
  204. "dingtalk",
  205. "wecom"
  206. ],
  207. "notify_repeat_step": 60,
  208. "callbacks": [],
  209. "runbook_url": "",
  210. "append_tags": []
  211. },
  212. {
  213. "name": "网卡-入向有丢包",
  214. "note": "",
  215. "severity": 3,
  216. "disabled": 0,
  217. "prom_for_duration": 60,
  218. "prom_ql": "increase(net_drop_in[1m]) > 0",
  219. "prom_eval_interval": 15,
  220. "enable_stime": "00:00",
  221. "enable_etime": "23:59",
  222. "enable_days_of_week": [
  223. "1",
  224. "2",
  225. "3",
  226. "4",
  227. "5",
  228. "6",
  229. "0"
  230. ],
  231. "notify_recovered": 1,
  232. "notify_channels": [
  233. "email",
  234. "dingtalk",
  235. "wecom"
  236. ],
  237. "notify_repeat_step": 60,
  238. "callbacks": [],
  239. "runbook_url": "",
  240. "append_tags": []
  241. },
  242. {
  243. "name": "网卡-出向有丢包",
  244. "note": "",
  245. "severity": 3,
  246. "disabled": 0,
  247. "prom_for_duration": 60,
  248. "prom_ql": "increase(net_drop_out[1m]) > 0",
  249. "prom_eval_interval": 15,
  250. "enable_stime": "00:00",
  251. "enable_etime": "23:59",
  252. "enable_days_of_week": [
  253. "1",
  254. "2",
  255. "3",
  256. "4",
  257. "5",
  258. "6",
  259. "0"
  260. ],
  261. "notify_recovered": 1,
  262. "notify_channels": [
  263. "email",
  264. "dingtalk",
  265. "wecom"
  266. ],
  267. "notify_repeat_step": 60,
  268. "callbacks": [],
  269. "runbook_url": "",
  270. "append_tags": []
  271. },
  272. {
  273. "name": "网络连接-TIME_WAIT数量超过2万",
  274. "note": "",
  275. "severity": 2,
  276. "disabled": 0,
  277. "prom_for_duration": 60,
  278. "prom_ql": "netstat_tcp_time_wait > 20000",
  279. "prom_eval_interval": 15,
  280. "enable_stime": "00:00",
  281. "enable_etime": "23:59",
  282. "enable_days_of_week": [
  283. "1",
  284. "2",
  285. "3",
  286. "4",
  287. "5",
  288. "6",
  289. "0"
  290. ],
  291. "notify_recovered": 1,
  292. "notify_channels": [
  293. "email",
  294. "dingtalk",
  295. "wecom"
  296. ],
  297. "notify_repeat_step": 60,
  298. "callbacks": [],
  299. "runbook_url": "",
  300. "append_tags": []
  301. },
  302. {
  303. "name": "进程监控-有进程数为0,某进程可能挂了",
  304. "note": "",
  305. "severity": 1,
  306. "disabled": 0,
  307. "prom_for_duration": 60,
  308. "prom_ql": "procstat_lookup_running == 0",
  309. "prom_eval_interval": 15,
  310. "enable_stime": "00:00",
  311. "enable_etime": "23:59",
  312. "enable_days_of_week": [
  313. "1",
  314. "2",
  315. "3",
  316. "4",
  317. "5",
  318. "6",
  319. "0"
  320. ],
  321. "notify_recovered": 1,
  322. "notify_channels": [
  323. "email",
  324. "dingtalk",
  325. "wecom"
  326. ],
  327. "notify_repeat_step": 60,
  328. "callbacks": [],
  329. "runbook_url": "",
  330. "append_tags": []
  331. },
  332. {
  333. "name": "进程监控-进程句柄限制过小",
  334. "note": "",
  335. "severity": 3,
  336. "disabled": 0,
  337. "prom_for_duration": 60,
  338. "prom_ql": "procstat_rlimit_num_fds_soft < 2048",
  339. "prom_eval_interval": 15,
  340. "enable_stime": "00:00",
  341. "enable_etime": "23:59",
  342. "enable_days_of_week": [
  343. "1",
  344. "2",
  345. "3",
  346. "4",
  347. "5",
  348. "6",
  349. "0"
  350. ],
  351. "notify_recovered": 1,
  352. "notify_channels": [
  353. "email",
  354. "dingtalk",
  355. "wecom"
  356. ],
  357. "notify_repeat_step": 60,
  358. "callbacks": [],
  359. "runbook_url": "",
  360. "append_tags": []
  361. },
  362. {
  363. "name": "进程监控-采集失败",
  364. "note": "",
  365. "severity": 1,
  366. "disabled": 0,
  367. "prom_for_duration": 60,
  368. "prom_ql": "procstat_lookup_result_code != 0",
  369. "prom_eval_interval": 15,
  370. "enable_stime": "00:00",
  371. "enable_etime": "23:59",
  372. "enable_days_of_week": [
  373. "1",
  374. "2",
  375. "3",
  376. "4",
  377. "5",
  378. "6",
  379. "0"
  380. ],
  381. "notify_recovered": 1,
  382. "notify_channels": [
  383. "email",
  384. "dingtalk",
  385. "wecom"
  386. ],
  387. "notify_repeat_step": 60,
  388. "callbacks": [],
  389. "runbook_url": "",
  390. "append_tags": []
  391. }
  392. ]

Grafana大盘

笔者做了一个Grafana大盘:https://grafana.com/grafana/dashboards/15365 。该大盘使用Telegraf做采集、Prometheus做数据源,并使用Nightingale生成的target_up指标来标识机器是否up,欢迎试用。