监控策略

Nightingale 内置了服务树这种机器分组机制,因此和 Open-Falcon 相比,告警灵活性有了质的提升。

监控策略与 Open-Falcon 的配置有很大区别:首先,取消了策略模板的机制,每一条策略都可以单独配置告警接收人;其次,策略可以直接绑定到服务树节点上,节点下的所有机器都会继承生效;另外,还增加了一些字段。下面逐个字段解释:

  • 策略名称:描述这条策略的作用,比如“CPU利用率超过85%”
  • 生效节点:关联的服务树节点,节点下所有机器都会应用这条策略
  • 排除节点:生效节点下面的部分子节点可能较为特殊需要排除,可以用此配置解决
  • 报警级别:分三级(P1、P2、P3)。P1最严重,报警之后事件通过所有报警通道推送;P3不严重,只通过部分通道推送
  • 统计周期:判断报警的时候使用最近多长时间以内的数据
  • 触发条件:支持“与”(AND)条件,即两个条件都满足才报警
  • Tag过滤:可以配置策略只对监控指标的部分tag生效,或者排除部分tag,比如disk.io.util只监控sda
  • 执行动作:配置报警收敛策略和报警接收人,也支持配置回调,与自动化逻辑打通
  • 留观时长:告警恢复后继续观察的时长(单位:秒),在此期间未再触发阈值才发送恢复通知
  • 静默恢复:开启后只发送告警消息,不发送恢复通知;默认不开启,即默认会发送恢复通知
  • 生效时间:即策略生效的时间段,默认7×24小时生效,也可以配置为只在部分时间段生效

策略配置页面支持导入,这里整理了一些常见策略,可以一键导入,然后批量修改一下报警接收人就可以用起来了 :-)

[
  {
    "name": "内存利用率大于75%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "mem.bytes.used.percent",
        "params": [],
        "threshold": 75
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "机器loadavg大于16",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "cpu.loadavg.1",
        "params": [],
        "threshold": 16
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "某磁盘无法正常读写",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.rw.error",
        "params": [],
        "threshold": 0
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "监控agent失联",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "exprs": [
      {
        "eopt": "=",
        "func": "nodata",
        "metric": "proc.agent.alive",
        "params": [],
        "threshold": 0
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "磁盘利用率达到85%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 3,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.bytes.used.percent",
        "params": [],
        "threshold": 85
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "磁盘利用率达到88%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.bytes.used.percent",
        "params": [],
        "threshold": 88
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "磁盘利用率达到92%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.bytes.used.percent",
        "params": [],
        "threshold": 92
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "端口挂了",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": "!=",
        "func": "all",
        "metric": "proc.port.listen",
        "params": [],
        "threshold": 1
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "网卡入方向丢包",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.in.dropped",
        "params": [],
        "threshold": 3
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "网卡出方向丢包",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.out.dropped",
        "params": [],
        "threshold": 3
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "进程总数超过3000",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "sys.ps.process.total",
        "params": [],
        "threshold": 3000
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  },
  {
    "name": "进程挂了",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "exprs": [
      {
        "eopt": "<",
        "func": "all",
        "metric": "proc.num",
        "params": [],
        "threshold": 1
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null
  }
]