Bucket count K-S test correlation aggregation

Bucket count K-S test correlation aggregation

A sibling pipeline aggregation which executes a two-sample Kolmogorov–Smirnov test (referred to as a “K-S test” from now on) comparing a provided distribution with the distribution implied by the document counts in the configured sibling aggregation. Specifically, for some metric, assuming that the percentile intervals of the metric are known beforehand or have been computed by an aggregation, one would use a range aggregation for the sibling to compute the p-value of the distribution difference between the metric and the restriction of that metric to a subset of the documents. A natural use case is when the sibling aggregation is a range aggregation nested in a terms aggregation, in which case one compares the overall distribution of the metric to its restriction to each term.

Parameters

buckets_path

(Required, string) Path to the buckets that contain one set of values to correlate. Must be a _count path. For syntax, see buckets_path Syntax.

alternative

(Optional, list) A list of string values indicating which K-S test alternative to calculate. The valid values are: “greater”, “less”, “two_sided”. This parameter is key for determining the K-S statistic used when calculating the K-S test. Default value is all possible alternative hypotheses.

fractions

(Optional, list) A list of doubles indicating the distribution of the samples with which to compare the buckets_path results. In typical usage this is the overall proportion of documents in each bucket, which is compared with the actual document proportions in each bucket from the sibling aggregation counts. The default is to assume that overall documents are uniformly distributed on these buckets, which they would be if one used equal percentiles of a metric to define the bucket end points.

sampling_method

(Optional, string) Indicates the sampling methodology when calculating the K-S test. Note, this is sampling of the returned values. This determines the cumulative distribution function (CDF) points used when comparing the two samples. The default is upper_tail, which emphasizes the upper end of the CDF points. Valid options are: upper_tail, uniform, and lower_tail.

Syntax

A bucket_count_ks_test aggregation looks like this in isolation:

  1. {
  2. "bucket_count_ks_test": {
  3. "buckets_path": "range_values>_count",
  4. "alternative": ["less", "greater", "two_sided"],
  5. "sampling_method": "upper_tail"
  6. }
  7. }

The buckets containing the values to test against.

The alternatives to calculate.

The sampling method for the K-S statistic.

Example

The following snippet runs the bucket_count_ks_test on the individual terms in the field version against a uniform distribution. The uniform distribution reflects the latency percentile buckets. Not shown is the pre-calculation of the latency indicator values, which was done utilizing the percentiles aggregation.

This example is only using the deciles of latency.

  1. resp = client.search(
  2. index="correlate_latency",
  3. size="0",
  4. filter_path="aggregations",
  5. aggs={
  6. "buckets": {
  7. "terms": {
  8. "field": "version",
  9. "size": 2
  10. },
  11. "aggs": {
  12. "latency_ranges": {
  13. "range": {
  14. "field": "latency",
  15. "ranges": [
  16. {
  17. "to": 0
  18. },
  19. {
  20. "from": 0,
  21. "to": 105
  22. },
  23. {
  24. "from": 105,
  25. "to": 225
  26. },
  27. {
  28. "from": 225,
  29. "to": 445
  30. },
  31. {
  32. "from": 445,
  33. "to": 665
  34. },
  35. {
  36. "from": 665,
  37. "to": 885
  38. },
  39. {
  40. "from": 885,
  41. "to": 1115
  42. },
  43. {
  44. "from": 1115,
  45. "to": 1335
  46. },
  47. {
  48. "from": 1335,
  49. "to": 1555
  50. },
  51. {
  52. "from": 1555,
  53. "to": 1775
  54. },
  55. {
  56. "from": 1775
  57. }
  58. ]
  59. }
  60. },
  61. "ks_test": {
  62. "bucket_count_ks_test": {
  63. "buckets_path": "latency_ranges>_count",
  64. "alternative": [
  65. "less",
  66. "greater",
  67. "two_sided"
  68. ]
  69. }
  70. }
  71. }
  72. }
  73. },
  74. )
  75. print(resp)
  1. const response = await client.search({
  2. index: "correlate_latency",
  3. size: 0,
  4. filter_path: "aggregations",
  5. aggs: {
  6. buckets: {
  7. terms: {
  8. field: "version",
  9. size: 2,
  10. },
  11. aggs: {
  12. latency_ranges: {
  13. range: {
  14. field: "latency",
  15. ranges: [
  16. {
  17. to: 0,
  18. },
  19. {
  20. from: 0,
  21. to: 105,
  22. },
  23. {
  24. from: 105,
  25. to: 225,
  26. },
  27. {
  28. from: 225,
  29. to: 445,
  30. },
  31. {
  32. from: 445,
  33. to: 665,
  34. },
  35. {
  36. from: 665,
  37. to: 885,
  38. },
  39. {
  40. from: 885,
  41. to: 1115,
  42. },
  43. {
  44. from: 1115,
  45. to: 1335,
  46. },
  47. {
  48. from: 1335,
  49. to: 1555,
  50. },
  51. {
  52. from: 1555,
  53. to: 1775,
  54. },
  55. {
  56. from: 1775,
  57. },
  58. ],
  59. },
  60. },
  61. ks_test: {
  62. bucket_count_ks_test: {
  63. buckets_path: "latency_ranges>_count",
  64. alternative: ["less", "greater", "two_sided"],
  65. },
  66. },
  67. },
  68. },
  69. },
  70. });
  71. console.log(response);
  1. POST correlate_latency/_search?size=0&filter_path=aggregations
  2. {
  3. "aggs": {
  4. "buckets": {
  5. "terms": {
  6. "field": "version",
  7. "size": 2
  8. },
  9. "aggs": {
  10. "latency_ranges": {
  11. "range": {
  12. "field": "latency",
  13. "ranges": [
  14. { "to": 0 },
  15. { "from": 0, "to": 105 },
  16. { "from": 105, "to": 225 },
  17. { "from": 225, "to": 445 },
  18. { "from": 445, "to": 665 },
  19. { "from": 665, "to": 885 },
  20. { "from": 885, "to": 1115 },
  21. { "from": 1115, "to": 1335 },
  22. { "from": 1335, "to": 1555 },
  23. { "from": 1555, "to": 1775 },
  24. { "from": 1775 }
  25. ]
  26. }
  27. },
  28. "ks_test": {
  29. "bucket_count_ks_test": {
  30. "buckets_path": "latency_ranges>_count",
  31. "alternative": ["less", "greater", "two_sided"]
  32. }
  33. }
  34. }
  35. }
  36. }
  37. }

The terms aggregation whose buckets contain a range aggregation and the bucket count K-S test aggregation. Both are used to compare the latency distribution of each term value with the overall latency distribution.

The range aggregation on the latency field. The ranges were created referencing the percentiles of the latency field.

The bucket count K-S test aggregation that tests whether the bucket counts come from the same distribution as fractions, where fractions is a uniform distribution.

And the following may be the response:

  1. {
  2. "aggregations" : {
  3. "buckets" : {
  4. "doc_count_error_upper_bound" : 0,
  5. "sum_other_doc_count" : 0,
  6. "buckets" : [
  7. {
  8. "key" : "1.0",
  9. "doc_count" : 100,
  10. "latency_ranges" : {
  11. "buckets" : [
  12. {
  13. "key" : "*-0.0",
  14. "to" : 0.0,
  15. "doc_count" : 0
  16. },
  17. {
  18. "key" : "0.0-105.0",
  19. "from" : 0.0,
  20. "to" : 105.0,
  21. "doc_count" : 1
  22. },
  23. {
  24. "key" : "105.0-225.0",
  25. "from" : 105.0,
  26. "to" : 225.0,
  27. "doc_count" : 9
  28. },
  29. {
  30. "key" : "225.0-445.0",
  31. "from" : 225.0,
  32. "to" : 445.0,
  33. "doc_count" : 0
  34. },
  35. {
  36. "key" : "445.0-665.0",
  37. "from" : 445.0,
  38. "to" : 665.0,
  39. "doc_count" : 0
  40. },
  41. {
  42. "key" : "665.0-885.0",
  43. "from" : 665.0,
  44. "to" : 885.0,
  45. "doc_count" : 0
  46. },
  47. {
  48. "key" : "885.0-1115.0",
  49. "from" : 885.0,
  50. "to" : 1115.0,
  51. "doc_count" : 10
  52. },
  53. {
  54. "key" : "1115.0-1335.0",
  55. "from" : 1115.0,
  56. "to" : 1335.0,
  57. "doc_count" : 20
  58. },
  59. {
  60. "key" : "1335.0-1555.0",
  61. "from" : 1335.0,
  62. "to" : 1555.0,
  63. "doc_count" : 20
  64. },
  65. {
  66. "key" : "1555.0-1775.0",
  67. "from" : 1555.0,
  68. "to" : 1775.0,
  69. "doc_count" : 20
  70. },
  71. {
  72. "key" : "1775.0-*",
  73. "from" : 1775.0,
  74. "doc_count" : 20
  75. }
  76. ]
  77. },
  78. "ks_test" : {
  79. "less" : 2.248673241788478E-4,
  80. "greater" : 1.0,
  81. "two_sided" : 5.791639181800257E-4
  82. }
  83. },
  84. {
  85. "key" : "2.0",
  86. "doc_count" : 100,
  87. "latency_ranges" : {
  88. "buckets" : [
  89. {
  90. "key" : "*-0.0",
  91. "to" : 0.0,
  92. "doc_count" : 0
  93. },
  94. {
  95. "key" : "0.0-105.0",
  96. "from" : 0.0,
  97. "to" : 105.0,
  98. "doc_count" : 19
  99. },
  100. {
  101. "key" : "105.0-225.0",
  102. "from" : 105.0,
  103. "to" : 225.0,
  104. "doc_count" : 11
  105. },
  106. {
  107. "key" : "225.0-445.0",
  108. "from" : 225.0,
  109. "to" : 445.0,
  110. "doc_count" : 20
  111. },
  112. {
  113. "key" : "445.0-665.0",
  114. "from" : 445.0,
  115. "to" : 665.0,
  116. "doc_count" : 20
  117. },
  118. {
  119. "key" : "665.0-885.0",
  120. "from" : 665.0,
  121. "to" : 885.0,
  122. "doc_count" : 20
  123. },
  124. {
  125. "key" : "885.0-1115.0",
  126. "from" : 885.0,
  127. "to" : 1115.0,
  128. "doc_count" : 10
  129. },
  130. {
  131. "key" : "1115.0-1335.0",
  132. "from" : 1115.0,
  133. "to" : 1335.0,
  134. "doc_count" : 0
  135. },
  136. {
  137. "key" : "1335.0-1555.0",
  138. "from" : 1335.0,
  139. "to" : 1555.0,
  140. "doc_count" : 0
  141. },
  142. {
  143. "key" : "1555.0-1775.0",
  144. "from" : 1555.0,
  145. "to" : 1775.0,
  146. "doc_count" : 0
  147. },
  148. {
  149. "key" : "1775.0-*",
  150. "from" : 1775.0,
  151. "doc_count" : 0
  152. }
  153. ]
  154. },
  155. "ks_test" : {
  156. "less" : 0.9642895789647244,
  157. "greater" : 4.58718174664754E-9,
  158. "two_sided" : 5.916656831139733E-9
  159. }
  160. }
  161. ]
  162. }
  163. }
  164. }