knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(lubridate)
library(mgcv)
library(scales)
Melbourne, Australia (January 2019)

Melbourne, Australia (January 2019)

Daily mean temperature trend analysis for Melbourne, Australia

I model mean daily temperature from January 1, 1995 to May 18, 2019 of Melbourne, Australia to find evidence of long term temperature trend. Using data from the Average Daily Temperature Archive, I build a mixed additive model of daily mean temperature with two smoothers—one for a possible trend, and one for seasonal effects–along with AR(2) residuals to capture daily persistence of weather shocks. My rough model is broad brush-stroke example of how to use additive models with mgcv in R while at the same time revealing climate change temperature trends in Melbourne, Australia. My model is based on section 7.7.2 of Simon Wood’s book Generalized Additive Models: An Introduction with R, Second Edition, a book I highly recommend for all data scientists.

A Generalized Linear Mixed Model (GLMM) takes the form: \[\begin{align}\boldsymbol{\mu}^b &= \mathbb{E}({\bf y}\mid{\bf b}), \\ g(\mu_i^b) &= {\bf X}_i\boldsymbol{\beta} + {\bf Z}_i{\bf b}, \\ {\bf b} &\sim N({\bf 0}, {\boldsymbol{\psi}}_\theta), \\ y_i\mid{\bf b} &\sim \text{exponential family dist.}\end{align}\] A generalized additive model uses this structure, but the design matrix \({\bf X}\) is built from spline evaluations with a “wiggliness” penalty, not on the regressors directly (coefficients correspond to the coefficients of the spline). For details, see Generalized Additive Models: An Introduction with R, Second Edition.

The additive model that I use is:

\[\begin{equation} \text{temp}_i = s_1(\text{time.of.year}_i) + s_2(\text{time}_i) + e_i\end{equation},\] where, \(e_i = \phi_1 e_{i-1} + \phi_2 e_{i-2}+ \epsilon_i\), \(\epsilon_1 \sim N(0,\sigma^2)\), \(s_1(\cdot)\) is a cyclic cubic smoothing spline that captures seasonal temperature variation, and \(s_2(\cdot)\) is a smoothing spline that tracks a temperature trend, if any. Below is my code implementation.

melbourne_temp<- read_table(file = "http://academic.udayton.edu/kissock/http/Weather/gsod95-current/AUMELBRN.txt", col_names = F) %>% 
  setNames(c("month", "day", "year", "temp"))%>% 
  mutate(date = str_c(year,"-",month,"-",day)) %>% 
  mutate(date = as_date(date)) %>% 
  mutate(day_num = yday(date))

#The last two days of 2015 (Dec 30 and Dec 31) have the wrong numeric month: 2 instead of 12. 
#The above code fails to parse those two dates. Need to fix them.

melbourne_temp[which(is.na(melbourne_temp$date)),]$month<-12 # fix the December mislabelled months

#Remake tibble with corrected date information
melbourne_temp<- melbourne_temp %>% 
  mutate(date = str_c(year,"-",month,"-",day)) %>% 
  mutate(date = as_date(date)) %>% 
  mutate(day_num = yday(date))
#missing temperature measurements encoded as -99 F
melbourne_temp<- melbourne_temp %>% mutate(temp = ifelse(temp == -99,NA,temp))

imp_fun<- function(dF){
  #linear approximation for imputing missing data
  temp_fun<- approxfun(dF$time, dF$temp)
  dF$temp<- temp_fun(dF$time)
  return(dF)
  
}

#remove February 29 so that the cc-spline has a period of 365 days. 
#Probably can come up with better ways of handling leaps years, but close enough for government work ;)

melbourne_temp<- melbourne_temp %>% mutate(time = as.numeric(date)) %>% filter(month !=2|day !=29) %>% imp_fun()


#additive model with nested AR(2) residuals
#trend smooth as a cubic regression spline; 
#seasonal smooth as a cyclic cubic regression spline
melbourne_gamm<- gamm(temp ~ s(day_num, bs ="cc", k=20) +s(time, bs="cr"), 
                  data = melbourne_temp, correlation = corARMA(form=~1|year, p=2))
raw_temp_plot<- ggplot(melbourne_temp, aes(date, (temp-32)*5/9)) + geom_point() + theme_bw() + ylab("Daily mean temperature (C)") +xlab("Date") + scale_y_continuous(breaks = pretty_breaks(10)) + scale_x_date(breaks = pretty_breaks(20)) + ggtitle("Melbourne, Australia") 



pp<- predict(melbourne_gamm$gam, type = "terms", se.fit = TRUE)#get gamm data

#trend smooth component
melbourne_temp$smooth<- ((pp[[1]][,2] +coef(melbourne_gamm$gam)[1])-32)*5/9
melbourne_temp$up<- ((pp[[1]][,2] + pp[[2]][,2] +coef(melbourne_gamm$gam)[1])-32)*5/9
melbourne_temp$down<- ((pp[[1]][,2] - pp[[2]][,2]+coef(melbourne_gamm$gam)[1])-32)*5/9

#cyclic smooth component
melbourne_temp$smooth_cc<- ((pp[[1]][,1] +coef(melbourne_gamm$gam)[1])-32)*5/9
melbourne_temp$up_cc<- ((pp[[1]][,1] + pp[[2]][,1] +coef(melbourne_gamm$gam)[1])-32)*5/9
melbourne_temp$down_cc<- ((pp[[1]][,1] - pp[[2]][,1]+coef(melbourne_gamm$gam)[1])-32)*5/9


#plots include the intercept term

trend_plot<- ggplot(melbourne_temp, aes(date, smooth)) + geom_line() + theme_bw() + scale_x_date(breaks = pretty_breaks(20)) + scale_y_continuous(breaks = pretty_breaks(20)) +
  geom_ribbon(aes(ymin = down, ymax = up), alpha = .4) + ggtitle("Melbourne, Australia: Long term temperature trend") + xlab("Date") + ylab("Daily mean temperature (C)") 

melbourne_temp_season<- melbourne_temp %>% filter(date>=as_date("1995-01-01") & date< as_date("1996-01-01")) 

seasonal_plot<- ggplot(melbourne_temp_season, aes(date, smooth_cc)) + geom_line() + theme_bw() + scale_x_date(breaks = pretty_breaks(12), labels = date_format("%b")) + scale_y_continuous(breaks = pretty_breaks(20)) +
  geom_ribbon(aes(ymin = down_cc, ymax = up_cc), alpha = .4) + ggtitle("Melbourne, Australia: Annual cycle") + coord_cartesian(expand = FALSE, ylim = c(8,22)) + xlab("Date") + ylab("Daily mean temperature (C)") 


print(raw_temp_plot)

print(trend_plot)

print(seasonal_plot)