本节展示告警规则的示例文件。可参照示例,自定义事件和审计的告警规则,然后创建告警规则。

事件告警规则示例

  # Example ClusterRuleGroup holding alerting rules for Kubernetes events.
  #
  # Each entry under spec.rules has:
  #   name    - rule identifier (kept unique within this group)
  #   expr    - kind "rule" plus the condition evaluated against an event
  #   desc    - short human-readable description
  #   enable  - whether the rule is active
  #   alerts  - severity and optional annotations (summary / summaryCn)
  #
  # NOTE(review): the nesting below was reconstructed from the key order of a
  # flattened listing; confirm it against the ClusterRuleGroup CRD schema.
  apiVersion: logging.whizard.io/v1alpha1
  kind: ClusterRuleGroup
  metadata:
    name: events-rules
  spec:
    type: events
    rules:
      # ---- Container events (Pod events that carry a fieldPath) ----
      - name: ContainerCreated
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Created" and involvedObject.fieldPath != ""
        desc: create new container
        enable: true
        alerts:
          severity: info
      - name: ContainerStarted
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Started" and involvedObject.fieldPath != ""
        desc: start new container
        alerts:
          severity: warning
          annotations:
            summary: start new container
            summaryCn: 启动新容器
        enable: true
      - name: ContainerFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="Failed" and involvedObject.fieldPath != ""
        desc: Create container failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Container failed
            summaryCn: 容器失败
      - name: ContainerKilling
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Killing" and involvedObject.fieldPath != ""
        desc: container kill
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: container killing
            summaryCn: 容器停止
      - name: ContainerPreempting
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="Preempting"
        desc: container is preempting
        alerts:
          severity: warning
          annotations:
            summary: Container is preempting
            summaryCn: 容器抢占中
        enable: true
      - name: ContainerBackoff
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="BackOff" and involvedObject.fieldPath != "" and count>3
        desc: container back off
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Container back-off
            summaryCn: 容器回退
      - name: ContainerUnhealthy
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="Unhealthy" and count>3
        desc: container is unhealthy
        alerts:
          severity: warning
          annotations:
            summary: Container is unhealthy
            summaryCn: 容器状态不良
        enable: true
      - name: ContainerProbeWarning
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="ProbeWarning" and count>3
        desc: Warning to perform a probe to the container
        alerts:
          severity: warning
          annotations:
            summary: Warning to perform a probe to the container
            summaryCn: 容器探测警告
        enable: true
      # ---- Pod events ----
      - name: PodKillingExceededGracePeriod
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="ExceededGracePeriod"
        desc: Pod killing exceeded specified grace period
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Pod killing exceeded specified grace period
            summaryCn: pod终止超时
      - name: PodKillFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedKillPod"
        desc: Failed to kill pod
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to kill pod
            summaryCn: pod终止失败
      - name: PodContainerCreateFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedCreatePodContainer"
        desc: Failed to create pod container
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to create pod container
            summaryCn: pod容器创建失败
      - name: PodFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="Failed" and involvedObject.fieldPath=""
        desc: Pod failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Pod failed
            summaryCn: pod失败
      - name: PodNetworkNotReady
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="NetworkNotReady"
        desc: Pod network is not ready
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Pod network is not ready
            summaryCn: Pod网络异常
      # ---- Image events ----
      - name: ImagePulling
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Pulling"
        desc: pull images
        enable: true
        alerts:
          severity: warning
      - name: ImagePulled
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Pulled"
        desc: images pulled
        enable: true
        alerts:
          severity: warning
      - name: ImagePullPolicyError
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="ErrImageNeverPull"
        desc: Wrong image pull policy
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Wrong image pull policy
            summaryCn: 镜像拉取策略错误
      - name: ImageInspectFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="InspectFailed"
        desc: Failed to inspect image
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to inspect image
            summaryCn: 镜像检查失败
      # ---- Node and kubelet events ----
      - name: NodeReady
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Node" and reason="NodeReady"
        desc: node is ready
        enable: true
        alerts:
          severity: warning
      - name: NodeSchedulable
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Node" and reason="NodeSchedulable"
        desc: node is schedulable
        enable: true
        alerts:
          severity: warning
      - name: NodeNotSchedulable
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Node" and reason="NodeNotSchedulable"
        desc: node is not schedulable
        enable: true
        alerts:
          severity: warning
      - name: KubeletStarting
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Node" and reason="Starting"
        desc: kubelet is starting
        enable: true
        alerts:
          severity: warning
      - name: KubeletSetupFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Node" and reason="KubeletSetupFailed"
        desc: Failed to setup kubelet
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to setup kubelet
            summaryCn: kubelet安装失败
      # ---- Volume attach / mount / resize events ----
      - name: VolumeAttachFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedAttachVolume"
        desc: Failed to attach volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to attach volume
            summaryCn: 存储卷装载失败
      - name: VolumeMountFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedMount"
        desc: Failed to mount volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to mount volume
            summaryCn: 存储卷挂载失败
      - name: VolumeResizeFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="VolumeResizeFailed"
        desc: Failed to expand/reduce volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to expand/reduce volume
            summaryCn: 存储卷扩缩容失败
      - name: VolumeResizeSuccess
        expr:
          kind: rule
          condition: type="Normal" and reason="VolumeResizeSuccessful"
        desc: volume resize success
        enable: true
        alerts:
          severity: warning
      - name: FileSystemResizeFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FileSystemResizeFailed"
        desc: failed to expand/reduce file system
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to expand/reduce file system
            summaryCn: 文件系统扩缩容失败
      - name: FileSystemResized
        expr:
          kind: rule
          condition: type="Normal" and reason="FileSystemResizeSuccessful"
        desc: File system resize success
        enable: true
        alerts:
          severity: warning
      - name: VolumeMapFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedMapVolume"
        desc: Failed to map volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to map volume
            summaryCn: 存储卷映射失败
      - name: VolumeAlreadyMounted
        expr:
          kind: rule
          condition: type="Warning" and reason="AlreadyMountedVolume"
        desc: Volume is already mounted
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Volume is already mounted
            summaryCn: 存储卷已被挂载
      - name: VolumeAttached
        expr:
          kind: rule
          condition: type="Normal" and reason="SuccessfulAttachVolume"
        desc: Volume is attached
        enable: true
        alerts:
          severity: warning
      - name: VolumeMounted
        expr:
          kind: rule
          condition: type="Normal" and reason="SuccessfulMountVolume"
        desc: volume is mounted
        enable: true
        alerts:
          severity: warning
      # ---- Node maintenance, GC, sandbox and disk events ----
      - name: NodeRebooted
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Node" and reason="Rebooted"
        desc: Node Rebooted
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Node Rebooted
            summaryCn: 节点重启
      - name: ContainerGCFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="ContainerGCFailed"
        desc: Container GC failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Container GC failed
            summaryCn: 容器GC失败
      - name: ImageGCFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="ImageGCFailed"
        desc: Image GC failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Image GC failed
            summaryCn: 镜像GC失败
      - name: NodeAllocatableEnforcementFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedNodeAllocatableEnforcement"
        desc: Node allocatable enforcement failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Node allocatable enforcement failed
            summaryCn: 节点可分配资源更新失败
      - name: NodeAllocatableEnforcedSuccess
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Node" and reason="NodeAllocatableEnforced"
        desc: Node allocatable enforcement success
        enable: true
        alerts:
          severity: warning
      - name: SandboxChanged
        expr:
          kind: rule
          condition: type="Normal" and reason="SandboxChanged"
        desc: Sandbox changed
        enable: true
        alerts:
          severity: warning
      - name: SandboxCreateFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedCreatePodSandBox"
        desc: Failed to create sandbox
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to create sandbox
            summaryCn: Sandbox创建失败
      - name: SandboxStatusFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedPodSandBoxStatus"
        desc: Failed to get sandbox status
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to get sandbox status
            summaryCn: 获取Sandbox状态错误
      - name: DiskCapacityInvalid
        expr:
          kind: rule
          condition: type="Warning" and reason="InvalidDiskCapacity"
        desc: Invalid disk capacity
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Invalid disk capacity
            summaryCn: 磁盘容量配置不合法
      - name: DiskSpaceFreeFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FreeDiskSpaceFailed"
        desc: Failed to free disk space
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to free disk space
            summaryCn: 磁盘空间释放失败
      # ---- Pod sync, validation and lifecycle-hook events ----
      - name: PodStatusSyncFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="FailedSync"
        desc: Failed To Sync Pod Status
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed To Sync Pod Status
            summaryCn: Pod状态同步失败
      - name: ConfigurationValidationFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="FailedValidation"
        desc: Configuration Validation Failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Configuration Validation Failed
            summaryCn: 配置验证失败
      - name: LifecycleHookPostStartFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedPostStartHook"
        desc: Failed to postStart LifecycleHook
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to postStart LifecycleHook
            summaryCn: 容器启动后的生命周期钩子运行失败
      - name: LifecycleHookPreStopFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedPreStopHook"
        desc: Failed to preStop LifecycleHook
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to preStop LifecycleHook
            summaryCn: 容器停止前的生命周期钩子运行失败
      # ---- HorizontalPodAutoscaler events ----
      - name: HPASelectorError
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="HorizontalPodAutoscaler" and reason in ("SelectorRequired","InvalidSelector")
        desc: HPA selector error
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: HPA selector error
            summaryCn: HPA选择器错误
      - name: HPAMetricError
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="HorizontalPodAutoscaler" and reason in ("FailedGetObjectMetric","InvalidMetricSourceType")
        desc: HPA metric error
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: HPA metric error
            summaryCn: HPA对象指标错误
      - name: HPAConvertFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="HorizontalPodAutoscaler" and reason="FailedConvertHPA"
        desc: Failed to convert HPA
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to convert HPA
            summaryCn: HPA转换失败
      - name: HPAGetScaleFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="HorizontalPodAutoscaler" and reason="FailedGetScale"
        desc: Failed to get HPA scale
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to get HPA scale
            summaryCn: HPA规模获取失败
      - name: HPAComputeReplicasFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="HorizontalPodAutoscaler" and reason="FailedComputeMetricsReplicas"
        desc: Failed to compute HPA replicas
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to compute HPA replicas
            summaryCn: HPA副本计算失败
      - name: HPARescaleFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="HorizontalPodAutoscaler" and reason="FailedRescale"
        desc: Failed to rescale HPA size
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to rescale HPA size
            summaryCn: HPA规模调整失败
      - name: HPARescaleSuccess
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="HorizontalPodAutoscaler" and reason="SuccessfulRescale"
        desc: Rescaled HPA size
        enable: true
        alerts:
          severity: warning
      - name: NodeSystemOOM
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Node" and reason="SystemOOM"
        desc: Node system OOM encountered
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Node system OOM encountered
            summaryCn: 节点内存溢出
      # ---- Volume binding / recycling / provisioning events ----
      - name: VolumeBindingFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="FailedBinding"
        desc: Volume binding failed
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Volume binding failed
            summaryCn: 存储卷绑定失败
      - name: VolumeMismatch
        expr:
          kind: rule
          condition: type="Warning" and reason="VolumeMismatch"
        desc: Volume Mismatch
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Volume Mismatch
            summaryCn: 存储卷不匹配
      - name: VolumeRecycleFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="VolumeFailedRecycle"
        desc: Failed to recycle volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to recycle volume
            summaryCn: 存储卷回收失败
      - name: VolumeRecycled
        expr:
          kind: rule
          condition: type="Normal" and reason="VolumeRecycled"
        desc: Volume Recycled
        enable: true
        alerts:
          severity: warning
      - name: VolumeRecyclerPodError
        expr:
          kind: rule
          condition: type="Warning" and reason="RecyclerPod"
        desc: Volume Recycler pod error
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Volume Recycler pod error
            summaryCn: 存储卷回收器错误
      - name: VolumeDeleted
        expr:
          kind: rule
          condition: type="Normal" and reason="VolumeDelete"
        desc: Volume Deleted
        enable: true
        alerts:
          severity: warning
      - name: VolumeDeleteFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="VolumeFailedDelete"
        desc: Failed to delete volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to delete volume
            summaryCn: 存储卷删除失败
      - name: VolumeProvisionFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="ProvisioningFailed"
        desc: Failed to provision volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to provision volume
            summaryCn: 存储申请失败
      - name: VolumeProvisioned
        expr:
          kind: rule
          condition: type="Normal" and reason="ProvisioningSucceeded"
        desc: Volume provisioned
        enable: true
        alerts:
          severity: warning
      - name: VolumeProvisionCleanupFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="ProvisioningCleanupFailed"
        desc: Failed to clean up provision volume
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to clean up provision volume
            summaryCn: 清理存储失败
      - name: VolumeExternalExpandingError
        expr:
          kind: rule
          condition: type="Warning" and reason="ExternalExpanding"
        desc: Error for volume external expanding
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Error for volume external expanding
            summaryCn: 存储外部扩展错误
      # ---- Scheduling and workload-controller events ----
      - name: PodScheduleFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Pod" and reason="FailedScheduling"
        desc: Failed to schedule pod
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to schedule pod
            summaryCn: pod调度失败
      - name: PodSchedulePreempted
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Preempted"
        desc: Pod preempted
        enable: true
        alerts:
          severity: warning
      - name: PodScheduled
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="Pod" and reason="Scheduled"
        desc: Pod scheduled
        enable: true
        alerts:
          severity: warning
      - name: PodCreateFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind in ("Pod","ReplicaSet","DaemonSet","StatefulSet","Job") and reason="FailedCreate"
        desc: Failed to create pod
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to create pod
            summaryCn: pod创建失败
      - name: PodCreated
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind in ("Pod","ReplicaSet","DaemonSet","StatefulSet","Job") and reason="SuccessfulCreate"
        desc: pod created
        enable: true
        alerts:
          severity: warning
      - name: PodDeleteFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind in ("Pod","ReplicaSet","DaemonSet","StatefulSet","Job") and reason="FailedDelete"
        desc: Failed to delete pod
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to delete pod
            summaryCn: pod删除失败
      - name: PodDeleted
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind in ("Pod","ReplicaSet","DaemonSet","StatefulSet","Job") and reason="SuccessfulDelete"
        desc: pod deleted
        enable: true
        alerts:
          severity: warning
      - name: ReplicaSetCreateError
        expr:
          kind: rule
          condition: type="Warning" and reason="ReplicaSetCreateError"
        desc: Error to create replica set for deployment
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Error to create replica set for deployment
            summaryCn: 副本集创建错误
      - name: DeploymentRollbackFailed
        expr:
          kind: rule
          condition: type="Warning" and reason in ("DeploymentRollbackRevisionNotFound","DeploymentRollbackTemplateUnchanged")
        desc: Failed to rollback deployment
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to rollback deployment
            summaryCn: 部署回滚失败
      - name: DeploySelectorAll
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="Deployment" and reason="SelectingAll"
        desc: The deploy is selecting all pods
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: The deploy is selecting all pods
            summaryCn: deploy选择了所有pod
      - name: DaemonSelectorAll
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="DaemonSet" and reason="SelectingAll"
        desc: The daemon set is selecting all pods
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: The daemon set is selecting all pods
            summaryCn: daemonset选择了所有pod
      - name: DaemonPodFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="DaemonSet" and reason in ("FailedDaemonPod","FailedPlacement")
        desc: Failed daemon pod
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed daemon pod
            summaryCn: daemonsetpod失败
      # ---- Load balancer events ----
      - name: LoadBalancerSyncFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="SyncLoadBalancerFailed"
        desc: Error syncing load balancer
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Error syncing load balancer
            summaryCn: 负载均衡器同步失败
      - name: LoadBalancerDeleting
        expr:
          kind: rule
          condition: type="Normal" and reason="DeletingLoadBalancer"
        desc: LoadBalancer is deleting
        enable: true
        alerts:
          severity: warning
      - name: LoadBalancerEnsuring
        expr:
          kind: rule
          condition: type="Normal" and reason="EnsuringLoadBalancer"
        desc: LoadBalancer is ensuring
        enable: true
        alerts:
          severity: warning
      - name: LoadBalancerEnsured
        expr:
          kind: rule
          condition: type="Normal" and reason="EnsuredLoadBalancer"
        desc: LoadBalancer is ensured
        enable: true
        alerts:
          severity: warning
      - name: LoadBalancerUnAvailable
        expr:
          kind: rule
          condition: type="Warning" and reason="UnAvailableLoadBalancer"
        desc: Load balancer is not available
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Load balancer is not available
            summaryCn: 负载均衡器不可用
      - name: LoadBalancerUpdated
        expr:
          kind: rule
          condition: type="Normal" and reason="UpdatedLoadBalancer"
        desc: LoadBalancer is updated
        enable: true
        alerts:
          severity: warning
      - name: LoadBalancerUpdateFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="UpdateLoadBalancerFailed"
        desc: Failed to update load balancer
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to update load balancer
            summaryCn: 更新负载均衡器失败
      - name: LoadBalancerDeleted
        expr:
          kind: rule
          condition: type="Normal" and reason="DeletedLoadBalancer"
        desc: LoadBalancer is deleted
        enable: true
        alerts:
          severity: warning
      - name: LoadBalancerDeleteFailed
        expr:
          kind: rule
          condition: type="Warning" and reason="DeleteLoadBalancerFailed"
        desc: Failed to delete load balancer
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to delete load balancer
            summaryCn: 负载均衡器删除失败
      # ---- CronJob events ----
      - name: JobGetFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="CronJob" and reason="FailedGet"
        desc: Failed to get job
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to get job
            summaryCn: 任务获取失败
      - name: JobCreated
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="CronJob" and reason="SuccessfulCreate"
        desc: job is created
        enable: true
        alerts:
          severity: warning
      - name: JobCreateFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="CronJob" and reason="FailedCreate"
        desc: Failed to create job
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to create job
            summaryCn: 任务创建失败
      - name: JobDeleted
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="CronJob" and reason="SuccessfulDelete"
        desc: job is deleted
        enable: true
        alerts:
          severity: warning
      - name: JobDeleteFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="CronJob" and reason="FailedDelete"
        desc: Failed to delete job
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: Failed to delete job
            summaryCn: 任务删除失败
      - name: JobCompleted
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="CronJob" and reason="SawCompletedJob"
        desc: job is completed
        enable: true
        alerts:
          severity: warning
      - name: JobUnexpected
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="CronJob" and reason="UnexpectedJob"
        desc: CronJob saw unexpected job
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: CronJob saw unexpected job
            summaryCn: 任务非预期
      - name: JobMissing
        expr:
          kind: rule
          condition: type="Normal" and involvedObject.kind="CronJob" and reason="MissingJob"
        desc: CronJob missed expected job
        enable: true
        alerts:
          severity: warning
      - name: JobScheduleFailed
        expr:
          kind: rule
          condition: type="Warning" and involvedObject.kind="CronJob" and reason in ("MissSchedule","FailedNeedsStart")
        desc: CronJob failed to schedule job
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: CronJob failed to schedule job
            summaryCn: 任务调度失败

审计告警规则示例

  # Example ClusterRuleGroup holding alerting rules for audit records.
  #
  # Entries under spec.rules come in four kinds (expr.kind):
  #   list   - a named list of values
  #   macro  - a named condition fragment
  #   alias  - a short name for a field of the audit record
  #   rule   - an alerting rule: a condition plus alert settings
  # Rules appear to reference lists, macros and aliases by name as ${name}
  # (for example ${action}, ${pod}, ${user}).
  #
  # NOTE(review): the nesting below (including `message` under `alerts`) was
  # reconstructed from the key order of a flattened listing; confirm it
  # against the ClusterRuleGroup CRD schema.
  apiVersion: logging.whizard.io/v1alpha1
  kind: ClusterRuleGroup
  metadata:
    name: auditing-rules
  spec:
    type: auditing
    rules:
      # ---- Lists ----
      # NOTE(review): ignore-action is not referenced by any rule shown here.
      - name: ignore-action
        expr:
          kind: list
          list:
            - get
            - list
            - watch
        desc: all action not need to be audit
      - name: action
        expr:
          kind: list
          list:
            - create
            - delete
            - update
            - patch
        desc: all operator need to be audit
      # ---- Macros and aliases ----
      - name: pod
        expr:
          kind: macro
          macro: ObjectRef.Resource="pods"
        desc: pod
      - name: service
        expr:
          kind: macro
          macro: ObjectRef.Resource="services"
        desc: service
      - name: user
        expr:
          kind: alias
          alias: User.username
        desc: the alias of the user related to audit event
      - name: name
        expr:
          kind: alias
          alias: ObjectRef.Name
        desc: the alias of the resource name
      - name: namespace
        expr:
          kind: alias
          alias: ObjectRef.Namespace
        desc: the alias of the resource namespace
      - name: create
        expr:
          kind: macro
          macro: Verb = "create"
        desc: create operator
      # ---- Alerting rules ----
      - name: ResourceChange
        expr:
          kind: rule
          condition: Verb in ${action}
        desc: audit the change of resource
        enable: true
        alerts:
          severity: info
      - name: CreateHostNetworkPod
        expr:
          kind: rule
          condition: '${pod} and ${create} and RequestObject.spec.hostNetwork = true'
        desc: Detect an attempt to start a pod using the host network
        alerts:
          severity: warning
          annotations:
            summary: create hostNetwork pod
            summaryCn: 创建 hostNetwork 容器
          message: '${user} ${Verb} HostNetwork Pod ${name} in Namespace ${namespace}.'
        enable: true
      - name: CreateHostportPod
        expr:
          kind: rule
          condition: '${pod} and ${create} and (RequestObject.spec.containers[*].ports[*].hostPort > 0 or RequestObject.spec.initContainers[*].ports[*].hostPort > 0)'
        desc: Detect an attempt to start a pod mount to a host port
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: create hostport pod
            summaryCn: 创建 hostport 容器
          message: '${user} ${Verb} HostPort Pod ${name} in Namespace ${namespace}.'
      - name: CreateNodePortService
        expr:
          kind: rule
          condition: '${service} and ${create} and RequestObject.spec.type = "NodePort"'
        desc: Detect an attempt to start a service with a NodePort service type
        enable: true
        alerts:
          severity: warning
          annotations:
            summary: create NodePort service
            summaryCn: 创建 NodePort 服务
          message: '${user} ${Verb} NodePort Service ${name} in Namespace ${namespace}.'
      # NOTE(review): unlike the other rules of kind `rule` in this group, no
      # `enable` key is listed for AttachOrExecPod — confirm whether it is
      # needed for the rule to take effect.
      - name: AttachOrExecPod
        expr:
          kind: rule
          condition: '${pod} and ${create} and ObjectRef.Subresource in ("exec", "attach")'
        desc: Detect any attempt to attach/exec to a pod
        alerts:
          severity: warning
          annotations:
            summary: attach or exec pod
            summaryCn: 进入容器
          message: '${user} ${ObjectRef.Subresource} Pod ${name} in Namespace ${namespace}.'