This webpage http://www.stat.pitt.edu/stoffer/tsa2/Rissues.htm highlights many issues when using R for time series analysis. I strongly recommend you read the webpage carefully before you start analyzing your data with R.
library(TSA)
library(forecast)
library(ggplot2)
library(scales)
The dataset is the color dataset from the textbook. We begin by studying the time series plot, ACF plot and the PACF plot.
color <- read.table("http://homepage.divms.uiowa.edu/~kchan/TSA/Datasets/color.dat",
header=T)
ts.plot(color)
acf(color)
pacf(color)
Based on the PACF plot, AR(1) seems to be a reasonable model. We can use forecast()
in the forecast
package to generate predictions as well as the prediction limits. In the output of arima()
or Arima()
, the confusing term intercept
refers to the mean.
# AR(1) looks reasonable
m1.color <- Arima(color, order=c(1, 0, 0))
plot(forecast(m1.color))
abline(h=m1.color$coef["intercept"], lty=2)
The data is the monthly milk production (pounds per cow) from January 1994 to December 2005.
milk <- read.table("http://homepage.divms.uiowa.edu/~kchan/TSA/Datasets/milk.dat",
header=T)
milk <- ts(milk$milk, freq=12, start=c(1994,1))
ts.plot(milk)
plot(stl(milk, s.window="periodic"))
See here for an excellent explanation of how to interpret the range bars in the stl plot. From the time series plot, we can see that the data is non-stationary, as there is a clear upward (linear) trend. We can use differencing to remove the linear trend.
milk_diff <- diff(milk)
ts.plot(milk_diff)
acf(milk_diff)
pacf(milk_diff)
eacf(milk_diff)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x o x x x x x o x x x x x
## 1 o x o x o x o x o o o x o x
## 2 x o x x o x o x o o o x x o
## 3 x o o o o x x o o o o x x o
## 4 x x o o o o x o o o o x x x
## 5 o x o x x o o o o o o x o x
## 6 x o x x x o o o o o o x x o
## 7 x o o o o x o o o o o x x o
Note that ARMA models do not seem to be appropriate for the differenced data since both the acf plot and pacf plot do not exhibit decaying behavior. For simplicity, let’s just use an ARIMA(1, 1, 0) model for the milk production data.
The function funggcast()
comes from here by Frank Davenport. It requires two arguments: the data (training period and test period) and a forecast object.
funggcast <- function(dn,fcast){
require(zoo) #needed for the 'as.yearmon()' function
en<-max(time(fcast$mean)) #extract the max date used in the forecast
#Extract Source and Training Data
ds<-as.data.frame(window(dn,end=en))
names(ds)<-'observed'
ds$date<-as.Date(time(window(dn,end=en)))
#Extract the Fitted Values (need to figure out how to grab confidence intervals)
dfit<-as.data.frame(fcast$fitted)
dfit$date<-as.Date(time(fcast$fitted))
names(dfit)[1]<-'fitted'
ds<-merge(ds,dfit,all.x=T) #Merge fitted values with source and training data
#Exract the Forecast values and confidence intervals
dfcastn<-as.data.frame(fcast)
dfcastn$date<-as.Date(as.yearmon(row.names(dfcastn)))
names(dfcastn)<-c('forecast','lo80','hi80','lo95','hi95','date')
pd<-merge(ds,dfcastn,all.x=T) #final data.frame for use in ggplot
return(pd)
}
milk_train <- window(milk, end=2003.99)
milk_fit <- Arima(milk_train, order=c(1, 1, 0),
include.drift=T)
milk_fit
## Series: milk_train
## ARIMA(1,1,0) with drift
##
## Coefficients:
## ar1 drift
## -0.5649 2.2245
## s.e. 0.0764 3.4978
##
## sigma^2 estimated as 3604: log likelihood=-655.33
## AIC=1316.67 AICc=1316.88 BIC=1325
milk_forecast <- forecast(milk_fit)
milk_df <- funggcast(milk, milk_forecast)
The following code is from here.
ggplot_forecast <- function(pd) {
p1a<-ggplot(data=pd,aes(x=date,y=observed))
p1a<-p1a+geom_line(col='red')
p1a<-p1a+geom_line(aes(y=fitted),col='blue')
p1a<-p1a+geom_line(aes(y=forecast))+geom_ribbon(aes(ymin=lo95,ymax=hi95),alpha=.25)
p1a<-p1a+scale_x_date(date_breaks = "12 month", date_labels = "%b-%y")
p1a<-p1a+scale_y_continuous(name='Units of Y')
p1a<-p1a+theme(axis.text.x=element_text(size=10))
p1a<-p1a+ggtitle("Arima Fit to Simulated Data\n(black=forecast, blue=fitted, red=data, shadow=95% conf. interval)")
p1a
}
ggplot_forecast(milk_df)
## Warning: Removed 24 rows containing missing values (geom_path).
## Warning: Removed 120 rows containing missing values (geom_path).
From the time series plot, there seems to be seasonal patterns in the data. This might be one reason why the model fit and the predictions look poor.
We use periodogram to estimate the spectral density. The peak locations are the important frequencies in the data.
spec_diff <- periodogram(milk_diff)
Based on the periodogram, all the key frequencies have estimated spectral density greater than \(10^4\).
key_freq <- spec_diff$freq[which(spec_diff$spec > 10^4)]
Consider the following simple model:
\[\nabla Y_t = \sum_{j = 1}^m [A_j \cos(2\pi f_j t) + B_j \sin(2\pi f_j t)] + e_t \]
where \(m\) is the number of key frequencies, \(A_j\) and \(B_j\) are unknown constants, \(f_j\)’s are the key frequencies, \(\{e_t\}\) is normal white noise.
t <- 1:length(milk_diff)
harmonics <- do.call(cbind, lapply(key_freq, function(freq){
cbind(cos(2 * pi * freq * t), sin(2 * pi * freq * t))
}))
reslm <- lm(milk_diff ~ harmonics)
plot(t, milk_diff, type="l")
lines(fitted(reslm)~t, col=4, lty=2)
The fit looks really good. Let’s try using the model for predictions.
milk_train_diff <- diff(milk_train)
t_train <- 1:length(milk_train_diff)
spec_train_diff <- periodogram(milk_train_diff)
key_freq_train <- spec_train_diff$freq[which(spec_train_diff$spec > 10^4)]
harmonics_train <- do.call(cbind, lapply(key_freq_train, function(freq){
cbind(cos(2 * pi * freq * t_train), sin(2 * pi * freq * t_train))
}))
reslm <- lm(milk_train_diff ~ harmonics_train)
milk_diff_harmonic_forecast <-
forecast(reslm, newdata=data.frame(harmonics[(length(t_train) + 1):length(t), ]))
plot(milk_diff,
main="Linear regression with sinusoids\n on the differenced series")
lines(time(milk_diff)[(length(t_train) + 1):length(t)],
milk_diff_harmonic_forecast$mean, col=4,
lty=2)
milk_unfolded <- cumsum(c(milk[length(milk) - 24], milk_diff_harmonic_forecast$mean))
milk_unfolded <- milk_unfolded[2:length(milk_unfolded )]
milk_fitted_unfolded <- cumsum(c(milk[1], fitted(reslm)))
plot(milk,
main="Linear regression with sinusoids\n on the differenced series")
lines(time(milk)[1:(length(milk) - 24)],
milk_fitted_unfolded, col="red",
lty=2)
lines(time(milk)[(length(milk) - 23):length(milk)],
milk_unfolded, col=4,
lty=2)
It turns out we can extend ARIMA model to incorporate seasonality. For now, we simply use auto.arima()
to determine the appropriate seasonal ARIMA model.
milk_auto_fit <- auto.arima(milk_train)
milk_auto_fit
## Series: milk_train
## ARIMA(3,0,0)(2,1,1)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 sar1 sar2 sma1 drift
## 0.7359 0.1612 -0.0758 -0.2191 -0.2687 -0.6752 2.2202
## s.e. 0.0969 0.1236 0.0993 0.1345 0.1153 0.1389 0.1581
##
## sigma^2 estimated as 147.9: log likelihood=-427.45
## AIC=870.91 AICc=872.36 BIC=892.37
milk_auto_forecast <- forecast(milk_auto_fit)
milk_auto_df <- funggcast(milk, milk_auto_forecast)
ggplot_forecast(milk_auto_df)
## Warning: Removed 24 rows containing missing values (geom_path).
## Warning: Removed 120 rows containing missing values (geom_path).