6.20 计算移动平均数
6.20.1 问题
你想要计算移动平均数。
6.20.2 解决方案
假设你的数据是带缺失值的噪声正弦波。
set.seed(993)
x <- 1:300
y <- sin(x/20) + rnorm(300, sd = 0.1)
y[251:255] <- NA
filter()
函数可以用来计算移动平均数。
# 绘制未平滑的数据(灰色)
plot(x, y, type = "l", col = grey(0.5))
# 绘制网格
grid()
# 延迟平滑 当前样本和之前 19 个样本的平均数(红色)
f20 <- rep(1/20, 20)
f20
#> [1] 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
#> [11] 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
y_lag <- filter(y, f20, sides = 1)
lines(x, y_lag, col = "red")
# 对称性平滑:
# 计算当前样本,时间上将来样本和过去10个样本的平均数(蓝色)
f21 <- rep(1/21, 21)
f21
#> [1] 0.04762 0.04762 0.04762 0.04762 0.04762 0.04762
#> [7] 0.04762 0.04762 0.04762 0.04762 0.04762 0.04762
#> [13] 0.04762 0.04762 0.04762 0.04762 0.04762 0.04762
#> [19] 0.04762 0.04762 0.04762
y_sym <- filter(y, f21, sides = 2)
lines(x, y_sym, col = "blue")
filter()
会在遭遇缺失值时留下空缺,就像上面图中显示的一样。
一种处理缺失值的方法是简单地忽略它,不把它包含在平均数的计算中。这个功能可以由下面的函数实现:
# x: the vector n: the number of samples centered: if
# FALSE, then average current sample and previous (n-1)
# samples if TRUE, then average symmetrically in past
# and future. (If n is even, use one more sample from
# future.)
movingAverage <- function(x, n = 1, centered = FALSE) {
if (centered) {
before <- floor((n - 1)/2)
after <- ceiling((n - 1)/2)
} else {
before <- n - 1
after <- 0
}
# Track the sum and count of number of non-NA items
s <- rep(0, length(x))
count <- rep(0, length(x))
# Add the centered data
new <- x
# Add to count list wherever there isn't a
count <- count + !is.na(new)
# Now replace NA_s with 0_s and add to total
new[is.na(new)] <- 0
s <- s + new
# Add the data from before
i <- 1
while (i <= before) {
# This is the vector with offset values to add
new <- c(rep(NA, i), x[1:(length(x) - i)])
count <- count + !is.na(new)
new[is.na(new)] <- 0
s <- s + new
i <- i + 1
}
# Add the data from after
i <- 1
while (i <= after) {
# This is the vector with offset values to add
new <- c(x[(i + 1):length(x)], rep(NA, i))
count <- count + !is.na(new)
new[is.na(new)] <- 0
s <- s + new
i <- i + 1
}
# return sum divided by count
s/count
}
# 用比较厚的线条绘制和之前一样的图
plot(x, y, type = "l", col = grey(0.5))
grid()
y_lag <- filter(y, rep(1/20, 20), sides = 1)
lines(x, y_lag, col = "red", lwd = 4) # 用红色表示延迟平均
y_sym <- filter(y, rep(1/21, 21), sides = 2)
lines(x, y_sym, col = "blue", lwd = 4) # 用蓝色表示对称平均
# 用上面定义的函数计算延迟平均
y_lag_na.rm <- movingAverage(y, 20)
lines(x, y_lag_na.rm, col = "green", lwd = 2)
# 用上面定义的函数计算对称性平均
y_sym_na.rm <- movingAverage(y, 21, TRUE)
lines(x, y_sym_na.rm, col = "green", lwd = 2)