Forecasting in R: Probability Bins for Time-Series Data

The time-series.R script below takes a set of historical time-series data and walks through it, measuring how much the value changed over each window the length of the remaining forecast period, to generate probabilistic outcomes from the data set.

The input is a CSV file with two columns (Date, Value), with dates in reverse chronological order and in ISO-8601 format, like so:

2019-08-06,1.73                                                                
2019-08-05,1.75                                                                
2019-08-02,1.86

Output is as follows:

0.466: Bin 1 - <1.7
0.328: Bin 2 - 1.7 to <=1.9
0.144: Bin 3 - 1.9+ to <2.1
0.045: Bin 4 - 2.1 to <=2.3
0.017: Bin 5 - 2.3+

Note: Patterns in the data set will skew the results. A 20-year upward trend will push probability toward the higher bins, while a volatile 5-year period will produce more conservative estimates and may not capture recent trends or a recent change in direction.
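A minimal call of the function defined below, assuming it has been sourced from time-series.R and that the data live in a hypothetical ./data/rates.csv, looks like this (the bin boundaries mirror the example output above):

source("time-series.R")
time_series(time_path="./data/rates.csv", closing_date="2020-01-01",
            trading_days=5, bin1=1.7, bin2=1.9, bin3=2.1, bin4=2.3)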

R Script

# time-series.R 
# Original: December 4, 2018
# Last revised: December 4, 2018

#################################################
# Description: This script is for running any 
# sequence of historical time-series data to make 
# a forecast for five values by a particular date.
# Assumes a csv file with two columns (Date, Value) 
# with dates in reverse chronological order and in
# ISO-8601 format. Like so:
#
# 2019-08-06,1.73                                                                
# 2019-08-05,1.75                                                                
# 2019-08-02,1.86

# Clear memory before running:
rm(list=ls())
gc()

#################################################
# Function (note: R does not allow "-" in object names,
# so the function is named time_series)
time_series <- function(time_path="./path/file.csv",
                        closing_date="2020-01-01", trading_days=5,
                        bin1=1.7, bin2=1.9,
                        bin3=2.1, bin4=2.3) {

  #################################################
  # Libraries
  #
  # This script uses only base R, so no additional
  # packages need to be installed.

  # Determine how many trading days remain until the question's closing date
  todays_date <- Sys.Date()
  closing_date <- as.Date(closing_date)
  remaining_weeks <- as.numeric(difftime(closing_date, todays_date, units = "weeks"))
  remaining_weeks <- round(remaining_weeks, digits=0)
  non_trading_days <- (7 - trading_days) * remaining_weeks
  day_difference <- as.numeric(difftime(closing_date, todays_date))
  remaining_days <- day_difference - non_trading_days 
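  # For example (hypothetical dates): from 2019-08-06 to a closing date of
  # 2020-01-01 there are 148 calendar days and roughly 21 weeks, so with
  # trading_days=5 there are 2*21 = 42 non-trading days and 148 - 42 = 106
  # remaining trading days.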

  #################################################
  # Import & Parse
  # Point to time series data file and import it.
  time_import <- read.csv(time_path, header=FALSE) 
  colnames(time_import) <- c("date", "value")

  # Setting data types
  time_import$date <- as.Date(time_import$date)
  time_import$value <- as.numeric(as.character(time_import$value))

  # Setting most recent value, assuming descending data
  current_value <- time_import[1,2]

  # Get the length of time_import$value and shorten it by remaining_days
  time_rows <- length(time_import$value) - remaining_days

  # Create an empty vector to hold the historical changes
  time_calc <- NULL

  # For each row, subtract the value from the row remaining_days
  # away (an earlier date, since the data is in descending order).
  for (i in 1:time_rows) {
    time_calc[i] <- time_import$value[i] - time_import$value[i+remaining_days]
  }
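
  # The same calculation, vectorized (equivalent to the loop above):
  # time_calc <- time_import$value[1:time_rows] -
  #   time_import$value[(1 + remaining_days):(time_rows + remaining_days)]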

  # Adjust the bin boundaries against the current value so they
  # are on the same scale as the changes in time_calc
  adj_bin1 <- bin1 - current_value
  adj_bin2 <- bin2 - current_value
  adj_bin3 <- bin3 - current_value 
  adj_bin4 <- bin4 - current_value 

  # Determine what fraction of the historical changes falls in each question bin
  prob1 <- round(sum(time_calc<adj_bin1)/length(time_calc), digits = 3)
  prob2 <- round(sum(time_calc>=adj_bin1 & time_calc<=adj_bin2)/length(time_calc), digits = 3)
  prob3 <- round(sum(time_calc>adj_bin2 & time_calc<adj_bin3)/length(time_calc), digits = 3)
  prob4 <- round(sum(time_calc>=adj_bin3 & time_calc<=adj_bin4)/length(time_calc), digits = 3)
  prob5 <- round(sum(time_calc>adj_bin4)/length(time_calc), digits = 3)
  
  ###############################################
  # Print results
  cat(paste0(prob1, ": Bin 1 - <", bin1, "\n",
             prob2, ": Bin 2 - ", bin1, " to <=", bin2, "\n",
             prob3, ": Bin 3 - ", bin2, "+ to <", bin3, "\n",
             prob4, ": Bin 4 - ", bin3, " to <=", bin4, "\n",
             prob5, ": Bin 5 - ", bin4, "+", "\n"))
}

Forecasting with R Script: Graph of WHO Flu Data

# flu.R
# Original: November 2, 2018
# Last revised: December 3, 2018 

#################################################
# Prep: Go to the WHO Flumart website:
# http://apps.who.int/flumart/Default?ReportNo=12
# Select Year 2000 Week 1 to current year, week 52.
# Save file to the data directory. Change weeks below. 

#################################################
# Description:
# Script parses the csv file and provides a graph
# with selectable yearly trend lines for
# comparison; it also includes analysis options at
# the bottom for predicting a particular day or
# searching by cases.

# Clear memory
rm(list=ls())
gc()

#################################################
# Set Variables in Function
flu <- function(flu_file="./data/FluNetInteractiveReport.csv",
                week_start=1, week_end=52) {

  #################################################
  #PRELIMINARIES

  #Define basepath and set working directory:
  basepath = "~/Documents/programs/R/forecasting"
  setwd(basepath)

  #Preventing scientific notation in graphs
  options(scipen=999)

  #################################################
  #Libraries 
  # If library X is not installed, you can install 
  # it with this command: install.packages('X')
  library(plyr)
  library(tidyr)
  library(data.table)
  library(lubridate)
  library(stringr)
  library(ggplot2)
  library(dplyr)
  library(reshape2)
  library(corrplot)
  library(hydroGOF)
  library(Hmisc)
  library(forecast)
  library(tseries)

  #################################################
  # Import & Parse
  # Point to downloaded flu data file, variable is above.
  flumart <- read.csv(flu_file, skip=3, header=TRUE) 

  # Drop all the columns but the ones of interest
  flumart <- flumart[ -c(2,3,6:19,21) ]

  # Assign column names to something more reasonable
  colnames(flumart) <- c("Country", "Year", "Week", "Confirmed_Flu", "Prevalence")  

  # Assign the country variable from the first row, first column
  country <- flumart[c(1),c(1)]

  # Incomplete years mess up correlation matrix
  flu_table <- filter(flumart, Year >= 2000)

  # Drop the non-numerical columns
  flu_table <- flu_table[,-c(1,5)]

  # Reshape the table into grid
  flu_table <- reshape(flu_table, direction="wide", idvar="Week", timevar="Year")

  # Fix column names after reshaping
  names(flu_table) <- gsub("Confirmed_Flu.", "", names(flu_table))
  
  # Put into matrix for correlations
  flu_table <- as.matrix(flu_table[,-c(1,5)])

  #################################################
  # Correlate & Plot
  flu_rcorr <- rcorr(flu_table)
  flu_coeff <- flu_rcorr$r
  flu_p <- flu_rcorr$P

  # Rank the years by how strongly they correlate with the
  # most recent year (the last column of the coefficient matrix)
  flu_matches <- flu_coeff[,ncol(flu_coeff)]
  flu_matches <- sort(flu_matches, decreasing = TRUE)
  flu_matches <- names(flu_matches)
  current_year <- as.numeric(flu_matches[1])
  matching_year1 <- as.numeric(flu_matches[2])
  matching_year2 <- as.numeric(flu_matches[3])
  matching_year3 <- as.numeric(flu_matches[4])
  matching_year4 <- as.numeric(flu_matches[5])
  matching_year5 <- as.numeric(flu_matches[6])

  #################################################
  # Prediction using ARIMA

  flu_data <- flumart # Importing initial flu data
  flu_data <- filter(flu_data, Week <= 52)
  flu_data <- flu_data[, -c(1:3,5)] # Keep only the Confirmed_Flu column
  flu_ts <- ts(flu_data, start = 1, frequency=52) # ARIMA needs time series
  flu_data <- as.vector(flu_data)
  flu_fit <- auto.arima(flu_ts, D=1) 
  flu_pred <- forecast(flu_fit, h=52)
  flu_plot <- data.frame(Predicted_Mean = as.numeric(flu_pred$mean))
  flu_plot$Week <- as.numeric(1:nrow(flu_plot))
  
  flu_prediction <- ggplot() + 
    ggtitle("Predicted Flu Incidence") +
    geom_line(data = flu_plot, aes(x = Week, y = Predicted_Mean)) + 
    scale_x_continuous() + scale_y_continuous()

  #################################################
  # Graph

  # Creating a temp variable for graph
  flu_graph <- flumart

  # Filtering results for a 5-year comparison against the 5 most closely correlated years
  flu_graph <- filter(flu_graph, Year %in% c(current_year, matching_year1,
                      matching_year2, matching_year3,
                      matching_year4, matching_year5))

  # These variables need to be numerical
  flu_graph$Week <- as.numeric(flu_graph$Week)
  flu_graph$Confirmed_Flu <- as.numeric(flu_graph$Confirmed_Flu)

  # Limit to weeks of interest
  flu_graph <- filter(flu_graph, Week >= week_start)
  flu_graph <- filter(flu_graph, Week <= week_end)  

  # The variable used to color and split the data should be a factor so lines are properly drawn
  flu_graph$Year <- factor(flu_graph$Year)

  # Lays out flu_graph by week with a colored line for each year
  flu_compare <- ggplot() + 
    ggtitle(paste("Confirmed Flu in", country)) +
    geom_line(data = flu_graph, aes(x = Week, y = Confirmed_Flu, color = Year)) + 
    geom_line(data = flu_plot, aes(x = Week, y = Predicted_Mean, color = "Forecast")) +
    scale_x_continuous()

  flu_week <- flu_graph[nrow(flu_graph),3]
  flu_year <- flu_graph[nrow(flu_graph),2]
  summary(flu_graph)

  ###############################################
  # Printing
  # Creating a csv file of the reshaped data
  write.csv(flu_table, file=paste0("./output/flu-in-", country, "-table-",
                                   flu_year, "-week-", flu_week, ".csv")) 
  
  # Print flu_compare to screen (explicit print needed inside a function)
  print(flu_compare)
  
  # Save flu_prediction to PDF
  ggsave(filename=paste0("./output/flu-in-", country, 
                         "-prediction-", flu_year, "-week-", flu_week, ".pdf"), plot=flu_prediction)
  
  # Save flu_compare to PDF
  ggsave(filename=paste0("./output/flu-in-", country, 
                         "-compare-", flu_year, "-week-", flu_week, ".pdf"), plot=flu_compare)
  
  # Print correlation matrix
  pdf(paste0("./output/flu-in-", country, "-correlation-", 
             flu_year, "-week-", flu_week, ".pdf"))
  corrplot(flu_coeff, method="pie", type="lower")
  dev.off()
    
  return(flu_graph[nrow(flu_graph),])
}
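
A typical call, assuming flu.R has been sourced and the FluNet export has been saved to the data directory (the path below is the function's default; week_start and week_end are illustrative):

source("flu.R")
flu(flu_file="./data/FluNetInteractiveReport.csv", week_start=1, week_end=52)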