Calculating a moving average

Problem

You want to calculate a moving average.

Solution

Suppose your data is a noisy sine wave with some missing values:

  1. set.seed(993)
  2. x <- 1:300
  3. y <- sin(x/20) + rnorm(300,sd=.1)
  4. y[251:255] <- NA

The filter() function can be used to calculate a moving average.

  1. # Plot the unsmoothed data (gray)
  2. plot(x, y, type="l", col=grey(.5))
  3. # Draw gridlines
  4. grid()
  5. # Smoothed with lag:
  6. # average of current sample and 19 previous samples (red)
  7. f20 <- rep(1/20, 20)
  8. f20
  9. #> [1] 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
  10. #> [18] 0.05 0.05 0.05
  11. y_lag <- filter(y, f20, sides=1)
  12. lines(x, y_lag, col="red")
  13. # Smoothed symmetrically:
  14. # average of current sample, 10 future samples, and 10 past samples (blue)
  15. f21 <- rep(1/21,21)
  16. f21
  17. #> [1] 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905
  18. #> [8] 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905
  19. #> [15] 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905 0.04761905
  20. y_sym <- filter(y, f21, sides=2)
  21. lines(x, y_sym, col="blue")

plot of chunk unnamed-chunk-2

filter() will leave holes wherever it encounters missing values, as shown in the graph above.

A different way to handle missing data is to simply ignore it, and not include it in the average. The function defined here will do that.

  1. # x: the vector
  2. # n: the number of samples
  3. # centered: if FALSE, then average current sample and previous (n-1) samples
  4. # if TRUE, then average symmetrically in past and future. (If n is even, use one more sample from future.)
  5. movingAverage <- function(x, n=1, centered=FALSE) {
  6. if (centered) {
  7. before <- floor ((n-1)/2)
  8. after <- ceiling((n-1)/2)
  9. } else {
  10. before <- n-1
  11. after <- 0
  12. }
  13. # Track the sum and count of number of non-NA items
  14. s <- rep(0, length(x))
  15. count <- rep(0, length(x))
  16. # Add the centered data
  17. new <- x
  18. # Add to count list wherever there isn't a
  19. count <- count + !is.na(new)
  20. # Now replace NA_s with 0_s and add to total
  21. new[is.na(new)] <- 0
  22. s <- s + new
  23. # Add the data from before
  24. i <- 1
  25. while (i <= before) {
  26. # This is the vector with offset values to add
  27. new <- c(rep(NA, i), x[1:(length(x)-i)])
  28. count <- count + !is.na(new)
  29. new[is.na(new)] <- 0
  30. s <- s + new
  31. i <- i+1
  32. }
  33. # Add the data from after
  34. i <- 1
  35. while (i <= after) {
  36. # This is the vector with offset values to add
  37. new <- c(x[(i+1):length(x)], rep(NA, i))
  38. count <- count + !is.na(new)
  39. new[is.na(new)] <- 0
  40. s <- s + new
  41. i <- i+1
  42. }
  43. # return sum divided by count
  44. s/count
  45. }
  46. # Make same plots from before, with thicker lines
  47. plot(x, y, type="l", col=grey(.5))
  48. grid()
  49. y_lag <- filter(y, rep(1/20, 20), sides=1)
  50. lines(x, y_lag, col="red", lwd=4) # Lagged average in red
  51. y_sym <- filter(y, rep(1/21,21), sides=2)
  52. lines(x, y_sym, col="blue", lwd=4) # Symmetrical average in blue
  53. # Calculate lagged moving average with new method and overplot with green
  54. y_lag_na.rm <- movingAverage(y, 20)
  55. lines(x, y_lag_na.rm, col="green", lwd=2)
  56. # Calculate symmetrical moving average with new method and overplot with green
  57. y_sym_na.rm <- movingAverage(y, 21, TRUE)
  58. lines(x, y_sym_na.rm, col="green", lwd=2)

plot of chunk unnamed-chunk-3