In [1]:
import warnings
warnings.simplefilter('ignore')

import itertools

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from sklearn.metrics import mean_squared_error

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Show every bare expression in a cell, not just the last one
# (several cells below display two outputs per cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)
idx = pd.IndexSlice
In [2]:
!head -n2 data/H1.csv
IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,342,2015,July,27,1,0,0,2,0,0,BB       ,PRT,Direct,Direct,              0,0,0,C               ,C               ,3,No Deposit     ,       NULL,       NULL,0,Transient,0,0,0,Check-Out,2015-07-01
In [3]:
data = pd.read_csv('data/H1.csv')

data.info()
data.head(3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40060 entries, 0 to 40059
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   IsCanceled                   40060 non-null  int64  
 1   LeadTime                     40060 non-null  int64  
 2   ArrivalDateYear              40060 non-null  int64  
 3   ArrivalDateMonth             40060 non-null  object 
 4   ArrivalDateWeekNumber        40060 non-null  int64  
 5   ArrivalDateDayOfMonth        40060 non-null  int64  
 6   StaysInWeekendNights         40060 non-null  int64  
 7   StaysInWeekNights            40060 non-null  int64  
 8   Adults                       40060 non-null  int64  
 9   Children                     40060 non-null  int64  
 10  Babies                       40060 non-null  int64  
 11  Meal                         40060 non-null  object 
 12  Country                      39596 non-null  object 
 13  MarketSegment                40060 non-null  object 
 14  DistributionChannel          40060 non-null  object 
 15  IsRepeatedGuest              40060 non-null  int64  
 16  PreviousCancellations        40060 non-null  int64  
 17  PreviousBookingsNotCanceled  40060 non-null  int64  
 18  ReservedRoomType             40060 non-null  object 
 19  AssignedRoomType             40060 non-null  object 
 20  BookingChanges               40060 non-null  int64  
 21  DepositType                  40060 non-null  object 
 22  Agent                        40060 non-null  object 
 23  Company                      40060 non-null  object 
 24  DaysInWaitingList            40060 non-null  int64  
 25  CustomerType                 40060 non-null  object 
 26  ADR                          40060 non-null  float64
 27  RequiredCarParkingSpaces     40060 non-null  int64  
 28  TotalOfSpecialRequests       40060 non-null  int64  
 29  ReservationStatus            40060 non-null  object 
 30  ReservationStatusDate        40060 non-null  object 
dtypes: float64(1), int64(17), object(13)
memory usage: 9.5+ MB
Out[3]:
IsCanceled LeadTime ArrivalDateYear ArrivalDateMonth ArrivalDateWeekNumber ArrivalDateDayOfMonth StaysInWeekendNights StaysInWeekNights Adults Children ... DepositType Agent Company DaysInWaitingList CustomerType ADR RequiredCarParkingSpaces TotalOfSpecialRequests ReservationStatus ReservationStatusDate
0 0 342 2015 July 27 1 0 0 2 0 ... No Deposit NULL NULL 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 0 737 2015 July 27 1 0 0 2 0 ... No Deposit NULL NULL 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 0 7 2015 July 27 1 0 1 1 0 ... No Deposit NULL NULL 0 Transient 75.0 0 0 Check-Out 2015-07-02

3 rows × 31 columns

In [4]:
ts = data['IsCanceled']
ts.index = pd.to_datetime(
    (data['ArrivalDateYear'].astype(str)
     + '-'
     + data['ArrivalDateMonth']
     + '-'
     + data['ArrivalDateDayOfMonth'].astype(str))
)

ts.head(2)
ts.tail(2)
Out[4]:
2015-07-01    0
2015-07-01    0
Name: IsCanceled, dtype: int64
Out[4]:
2017-08-31    0
2017-08-31    0
Name: IsCanceled, dtype: int64
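
Because the month arrives as a name ("July"), the parse above leans on pandas' format inference. A minimal alternative sketch with the format spelled out explicitly (using the strptime %B directive for full month names):

date_str = (data['ArrivalDateYear'].astype(str)
            + '-'
            + data['ArrivalDateMonth']
            + '-'
            + data['ArrivalDateDayOfMonth'].astype(str))
# Same index as above, but with no format guessing involved
ts.index = pd.to_datetime(date_str, format='%Y-%B-%d')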
In [5]:
ts = ts.resample('W-MON').sum()

ts.head(2)
ts.tail(2)
Out[5]:
2015-07-06    54
2015-07-13    56
Freq: W-MON, Name: IsCanceled, dtype: int64
Out[5]:
2017-08-28    153
2017-09-04     42
Freq: W-MON, Name: IsCanceled, dtype: int64
In [6]:
fig = plt.figure(constrained_layout=True, figsize=(14, 7))
gs = fig.add_gridspec(2, 2)

ax_top = fig.add_subplot(gs[0, :])
ax_bot_0 = fig.add_subplot(gs[1, 0])
ax_bot_1 = fig.add_subplot(gs[1, 1])

_ = ts.plot(ax=ax_top, title='Number of cancellations per week', label='observed')
_ = ts.rolling(window=4).mean().plot(ax=ax_top, label='4-period SMA')
_ = plot_acf(ts, ax=ax_bot_0, title='ACF')
_ = plot_pacf(ts, ax=ax_bot_1, title='PACF')

_ = ax_top.legend()

seasonal_decompose docs

In [7]:
decomposition = seasonal_decompose(ts, period=52)

fig, axs = plt.subplots(ncols=2, sharex=True, figsize=(14, 4))

_ = decomposition.trend.plot(ax=axs[0], title='Trend')
_ = decomposition.seasonal.plot(ax=axs[1], title='Seasonality')
# decomposition.resid

fig.tight_layout()

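The residual component (left commented out above) can be inspected the same way; a quick sketch reusing the decomposition object fitted in the previous cell:

# Whatever trend and seasonality do not explain; large spikes flag
# weeks the additive decomposition handles poorly.
ax = decomposition.resid.plot(title='Residual', figsize=(14, 2))
sns.despine()
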
90/10 train-test split

In [8]:
len(ts)
round(len(ts) * .9), len(ts) - round(len(ts) * .9) 
Out[8]:
114
Out[8]:
(103, 11)
In [9]:
train, test = ts.iloc[:103], ts.iloc[103:]

len(train), len(test)
Out[9]:
(103, 11)

Modelling

In [10]:
%%time

# Grid search: keep the (p, d, q) order with the lowest BIC
p = d = q = range(0, 3)
best_bic = np.inf
params = tuple()

pdq = itertools.product(p, d, q)
next(pdq)  # Skip (0, 0, 0)

for var in pdq:
    mod = ARIMA(train, order=var)
    res = mod.fit()

    if res.bic <= best_bic:
        best_bic = res.bic
        params = var

params
CPU times: user 9.87 s, sys: 131 ms, total: 10 s
Wall time: 3.61 s
Out[10]:
(0, 1, 1)
In [11]:
mod = ARIMA(train, order=(0, 1, 1))
res = mod.fit()

res.summary()
Out[11]:
SARIMAX Results
Dep. Variable: IsCanceled No. Observations: 103
Model: ARIMA(0, 1, 1) Log Likelihood -522.400
Date: Sun, 21 Feb 2021 AIC 1048.801
Time: 19:03:11 BIC 1054.051
Sample: 07-06-2015 HQIC 1050.927
- 06-19-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ma.L1 -0.7323 0.067 -10.915 0.000 -0.864 -0.601
sigma2 1632.2393 225.046 7.253 0.000 1191.157 2073.322
Ljung-Box (L1) (Q): 0.73 Jarque-Bera (JB): 1.34
Prob(Q): 0.39 Prob(JB): 0.51
Heteroskedasticity (H): 0.50 Skew: 0.28
Prob(H) (two-sided): 0.05 Kurtosis: 3.04


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [12]:
forecast = res.get_forecast(len(test))
yhat = forecast.predicted_mean
yhat_conf_int = forecast.conf_int(alpha=0.05)

ci_lo = yhat_conf_int.iloc[:, 0]
ci_up = yhat_conf_int.iloc[:, 1]

ax = ts.plot(label='y', title='Number of cancellations per week', figsize=(12, 6))
_ = yhat.plot(ax=ax, ls='--', label='forecast')
_ = ax.fill_between(ci_lo.index, ci_lo, ci_up, alpha=0.2, color='lightgrey', label='95% CI')
legend = ax.legend(loc='upper left')

sns.despine()
ax.get_figure().tight_layout()

Calculate the root mean squared error (RMSE) of the forecast against the actual number of cancellations in the test set, and compare it to the RMSE of a naive baseline that predicts the test-set mean for every week. Also report the mean absolute percentage error (MAPE).

In [13]:
def rmse(y_true, y_pred):
    """Root mean squared error."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))


def mape(y_true, y_pred):
    """Mean absolute percentage error."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
In [14]:
np.sqrt(mean_squared_error(test, yhat)) == rmse(test, yhat)
Out[14]:
True
In [15]:
rmse(test, yhat), rmse(test, test.mean())
mape(test, yhat)
Out[15]:
(38.18750253046791, 38.17770540620068)
Out[15]:
33.032347014314446

Other models

pmdarima

import pmdarima as pm

arima_model = pm.auto_arima(
    train,
    start_p=0, start_q=0,
    max_p=10, max_q=10,
    start_P=0, start_Q=0,
    max_P=10, max_Q=10,
    m=51,
    stepwise=True,
    seasonal=True,
    information_criterion='bic',
    trace=True,
    d=1, D=1,
    error_action='warn',
    suppress_warnings=True,
    random_state=20,
    n_fits=30)

Best model: ARIMA(2,1,0)(0,1,0)[51]
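
With pmdarima installed, the fitted auto_arima model could also forecast the test horizon directly; a sketch (not run here) using its standard predict arguments:

# Point forecasts plus 95% prediction intervals straight from pmdarima
yhat_pm, ci_pm = arima_model.predict(
    n_periods=len(test), return_conf_int=True, alpha=0.05)

Below, the suggested orders are instead refitted with statsmodels, which keeps the comparison against the earlier ARIMA(0, 1, 1) within a single library.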

In [16]:
mod2 = ARIMA(train, order=(2, 1, 0))
res2 = mod2.fit()

res2.summary()
Out[16]:
SARIMAX Results
Dep. Variable: IsCanceled No. Observations: 103
Model: ARIMA(2, 1, 0) Log Likelihood -522.195
Date: Sun, 21 Feb 2021 AIC 1050.390
Time: 19:03:13 BIC 1058.265
Sample: 07-06-2015 HQIC 1053.579
- 06-19-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 -0.7481 0.087 -8.637 0.000 -0.918 -0.578
ar.L2 -0.3270 0.103 -3.165 0.002 -0.530 -0.125
sigma2 1628.2704 206.099 7.900 0.000 1224.323 2032.218
Ljung-Box (L1) (Q): 0.07 Jarque-Bera (JB): 2.81
Prob(Q): 0.79 Prob(JB): 0.25
Heteroskedasticity (H): 0.60 Skew: 0.33
Prob(H) (two-sided): 0.14 Kurtosis: 3.47


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [17]:
round(res.bic, 4), round(res2.bic, 4)
res.bic < res2.bic
Out[17]:
(1054.0508, 1058.2649)
Out[17]:
True

ARIMA docs

In [18]:
mod3 = ARIMA(train, order=(2, 1, 0), seasonal_order=(0, 1, 0, 51))
res3 = mod3.fit()

res3.summary()
Out[18]:
SARIMAX Results
Dep. Variable: IsCanceled No. Observations: 103
Model: ARIMA(2, 1, 0)x(0, 1, 0, 51) Log Likelihood -279.396
Date: Sun, 21 Feb 2021 AIC 564.793
Time: 19:03:14 BIC 570.588
Sample: 07-06-2015 HQIC 567.007
- 06-19-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 -0.8131 0.163 -4.974 0.000 -1.134 -0.493
ar.L2 -0.3285 0.154 -2.131 0.033 -0.631 -0.026
sigma2 3312.0904 597.750 5.541 0.000 2140.522 4483.659
Ljung-Box (L1) (Q): 0.02 Jarque-Bera (JB): 0.80
Prob(Q): 0.88 Prob(JB): 0.67
Heteroskedasticity (H): 0.89 Skew: -0.19
Prob(H) (two-sided): 0.82 Kurtosis: 3.47


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [19]:
round(res.bic, 4), round(res3.bic, 4)
res.bic < res3.bic
Out[19]:
(1054.0508, 570.5881)
Out[19]:
False
In [20]:
forecast = res3.get_forecast(len(test))
yhat = forecast.predicted_mean
yhat_conf_int = forecast.conf_int(alpha=0.05)

ci_lo = yhat_conf_int.iloc[:, 0]
ci_up = yhat_conf_int.iloc[:, 1]

ax = ts.plot(label='y', title='Number of cancellations per week', figsize=(12, 6))
_ = yhat.plot(ax=ax, ls='--', label='forecast')
_ = ax.fill_between(ci_lo.index, ci_lo, ci_up, alpha=0.2, color='lightgrey', label='95% CI')
legend = ax.legend(loc='upper left')

sns.despine()
ax.get_figure().tight_layout()
In [21]:
(rmse(test, res.get_forecast(len(test)).predicted_mean),
 rmse(test, test.mean()),
 rmse(test, res3.get_forecast(len(test)).predicted_mean))
Out[21]:
(38.18750253046791, 38.17770540620068, 63.342662387571124)
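
For a like-for-like comparison with the earlier MAPE figure, the same metric can be computed for both fitted models; a short sketch reusing the mape helper and results objects from above:

# MAPE: ARIMA(0, 1, 1) vs seasonal ARIMA(2, 1, 0)x(0, 1, 0, 51)
(mape(test, res.get_forecast(len(test)).predicted_mean),
 mape(test, res3.get_forecast(len(test)).predicted_mean))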