Google提出了应用监控的4个黄金指标,分别是:流量、延迟、错误、饱和度,其中前3个指标都可以通过内嵌SDK的方式埋点采集。夜莺的核心模块有两个:webapi主要提供HTTP接口给JavaScript调用,server主要负责接收监控数据、处理告警规则。这两个模块都引入了Prometheus的Go SDK,以此方式做应用性能(App Performance)监控。本节以夜莺的代码为例,讲解如何使用Prometheus的SDK。



path着重说一下,它表示请求路径,比如上面提到的/api/n9e/users。但是,在RESTful实践中,URL中经常会带有参数,比如获取编号为1的用户的信息,接口是/api/n9e/user/1,获取编号为2的用户的信息,接口是/api/n9e/user/2。如果把这两个带有用户编号的URL都作为Label,会造成时序库索引爆炸;而且从业务方使用的角度来看,我们并不关注请求的是编号为1的用户还是编号为2的用户,而是关注GET /api/n9e/user/:id这个接口整体的监控数据。所以我们在设置Label的时候,要把path设置为/api/n9e/user/:id,而不是具体的带有用户编号的URL路径。夜莺用的是gin框架,gin框架的FullPath方法就是用来获取这个信息的,比较方便。


  1. package stat
  2. import (
  3. "time"
  4. ""
  5. )
  6. const Service = "n9e-webapi" // value reported for the "service" label
  7. var (
  8. labels = []string{"service", "code", "path", "method"} // label names shared by RequestCounter and RequestDuration
  9. uptime = prometheus.NewCounterVec( // counter incremented by recordUptime
  10. prometheus.CounterOpts{
  11. Name: "uptime",
  12. Help: "HTTP service uptime.",
  13. }, []string{"service"},
  14. )
  15. RequestCounter = prometheus.NewCounterVec( // one increment per handled HTTP request
  16. prometheus.CounterOpts{
  17. Name: "http_request_count_total",
  18. Help: "Total number of HTTP requests made.",
  19. }, labels,
  20. )
  21. RequestDuration = prometheus.NewHistogramVec( // per-request latency histogram
  22. prometheus.HistogramOpts{
  23. Buckets: []float64{.01, .1, 1, 10}, // bucket upper bounds in seconds: 10ms, 100ms, 1s, 10s
  24. Name: "http_request_duration_seconds",
  25. Help: "HTTP request latencies in seconds.",
  26. }, labels,
  27. )
  28. )
  29. func Init() {
  30. // Register the two counters and the histogram with Prometheus's default registry.
  31. prometheus.MustRegister(
  32. uptime,
  33. RequestCounter,
  34. RequestDuration,
  35. )
  36. go recordUptime() // keep incrementing the uptime counter in the background
  37. }
  38. // recordUptime increments the uptime counter for Service once per second; it never returns.
  39. func recordUptime() {
  40. for range time.Tick(time.Second) { // the ticker is never stopped, so this loop has no exit
  41. uptime.WithLabelValues(Service).Inc()
  42. }
  43. }



  1. import (
  2. ...
  3. promstat ""
  4. )
  5. func stat() gin.HandlerFunc { // returns a middleware that records request count and latency
  6. return func(c *gin.Context) {
  7. start := time.Now()
  8. c.Next() // run the remaining handlers first so the status code and elapsed time are final
  9. code := fmt.Sprintf("%d", c.Writer.Status()) // response status as a decimal string label value
  10. method := c.Request.Method
  11. labels := []string{promstat.Service, code, c.FullPath(), method} // values in the order service, code, path, method
  12. promstat.RequestCounter.WithLabelValues(labels...).Inc()
  13. promstat.RequestDuration.WithLabelValues(labels...).Observe(float64(time.Since(start).Seconds())) // Seconds() already returns float64; the conversion is a no-op
  14. }
  15. }


  1. ...
  2. r := gin.New()
  3. r.Use(stat()) // attach the stat middleware to the engine
  4. ...


  1. import (
  2. ...
  3. ""
  4. )
  5. func configRoute(r *gin.Engine, version string) {
  6. ...
  7. r.GET("/metrics", gin.WrapH(promhttp.Handler())) // serve the Prometheus HTTP handler on GET /metrics
  8. }





  1. package stat
  2. import (
  3. ""
  4. )
  5. const (
  6. namespace = "n9e"
  7. subsystem = "server"
  8. )
  9. var (
  10. // Execution time of each periodic task.
  11. GaugeCronDuration = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  12. Namespace: namespace,
  13. Subsystem: subsystem,
  14. Name: "cron_duration",
  15. Help: "Cron method use duration, unit: ms.",
  16. }, []string{"cluster", "name"})
  17. // Number of records synced when syncing data from the database.
  18. GaugeSyncNumber = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  19. Namespace: namespace,
  20. Subsystem: subsystem,
  21. Name: "cron_sync_number",
  22. Help: "Cron sync number.",
  23. }, []string{"cluster", "name"})
  24. // Total amount of monitoring data received from each receiving interface.
  25. CounterSampleTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
  26. Namespace: namespace,
  27. Subsystem: subsystem,
  28. Name: "samples_received_total",
  29. Help: "Total number samples received.",
  30. }, []string{"cluster", "channel"})
  31. // Total number of alerts generated.
  32. CounterAlertsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
  33. Namespace: namespace,
  34. Subsystem: subsystem,
  35. Name: "alerts_total",
  36. Help: "Total number alert events.",
  37. }, []string{"cluster"})
  38. // Length of the in-memory alert event queue.
  39. GaugeAlertQueueSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  40. Namespace: namespace,
  41. Subsystem: subsystem,
  42. Name: "alert_queue_size",
  43. Help: "The size of alert queue.",
  44. }, []string{"cluster"})
  45. )
  46. func Init() {
  47. // Register this package's gauges and counters with Prometheus's default registry.
  48. prometheus.MustRegister(
  49. GaugeCronDuration,
  50. GaugeSyncNumber,
  51. CounterSampleTotal,
  52. CounterAlertsTotal,
  53. GaugeAlertQueueSize,
  54. )
  55. }


  1. package engine
  2. import (
  3. "context"
  4. "time"
  5. ""
  6. promstat ""
  7. )
  8. func Start(ctx context.Context) error {
  9. ...
  10. go reportQueueSize() // started in the background; ctx is not passed to it
  11. return nil
  12. }
  13. func reportQueueSize() { // publishes the event queue length as a gauge; loops forever
  14. for {
  15. time.Sleep(time.Second) // wait one second between samples
  16. promstat.GaugeAlertQueueSize.WithLabelValues(config.C.ClusterName).Set(float64(EventQueue.Len())) // labelled with the configured cluster name
  17. }
  18. }



