In [1]:
import warnings
warnings.simplefilter('ignore')

import itertools

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from sklearn.metrics import mean_squared_error

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Show every bare expression in a cell, not just the last one
# (several cells below display two outputs per cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)
idx = pd.IndexSlice
In [2]:
!head -n2 data/H1.csv
IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,342,2015,July,27,1,0,0,2,0,0,BB       ,PRT,Direct,Direct,              0,0,0,C               ,C               ,3,No Deposit     ,       NULL,       NULL,0,Transient,0,0,0,Check-Out,2015-07-01
In [3]:
data = pd.read_csv('data/H1.csv')

data.info()
data.head(3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40060 entries, 0 to 40059
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   IsCanceled                   40060 non-null  int64  
 1   LeadTime                     40060 non-null  int64  
 2   ArrivalDateYear              40060 non-null  int64  
 3   ArrivalDateMonth             40060 non-null  object 
 4   ArrivalDateWeekNumber        40060 non-null  int64  
 5   ArrivalDateDayOfMonth        40060 non-null  int64  
 6   StaysInWeekendNights         40060 non-null  int64  
 7   StaysInWeekNights            40060 non-null  int64  
 8   Adults                       40060 non-null  int64  
 9   Children                     40060 non-null  int64  
 10  Babies                       40060 non-null  int64  
 11  Meal                         40060 non-null  object 
 12  Country                      39596 non-null  object 
 13  MarketSegment                40060 non-null  object 
 14  DistributionChannel          40060 non-null  object 
 15  IsRepeatedGuest              40060 non-null  int64  
 16  PreviousCancellations        40060 non-null  int64  
 17  PreviousBookingsNotCanceled  40060 non-null  int64  
 18  ReservedRoomType             40060 non-null  object 
 19  AssignedRoomType             40060 non-null  object 
 20  BookingChanges               40060 non-null  int64  
 21  DepositType                  40060 non-null  object 
 22  Agent                        40060 non-null  object 
 23  Company                      40060 non-null  object 
 24  DaysInWaitingList            40060 non-null  int64  
 25  CustomerType                 40060 non-null  object 
 26  ADR                          40060 non-null  float64
 27  RequiredCarParkingSpaces     40060 non-null  int64  
 28  TotalOfSpecialRequests       40060 non-null  int64  
 29  ReservationStatus            40060 non-null  object 
 30  ReservationStatusDate        40060 non-null  object 
dtypes: float64(1), int64(17), object(13)
memory usage: 9.5+ MB
Out[3]:
IsCanceled LeadTime ArrivalDateYear ArrivalDateMonth ArrivalDateWeekNumber ArrivalDateDayOfMonth StaysInWeekendNights StaysInWeekNights Adults Children ... DepositType Agent Company DaysInWaitingList CustomerType ADR RequiredCarParkingSpaces TotalOfSpecialRequests ReservationStatus ReservationStatusDate
0 0 342 2015 July 27 1 0 0 2 0 ... No Deposit NULL NULL 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 0 737 2015 July 27 1 0 0 2 0 ... No Deposit NULL NULL 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 0 7 2015 July 27 1 0 1 1 0 ... No Deposit NULL NULL 0 Transient 75.0 0 0 Check-Out 2015-07-02

3 rows × 31 columns

In [4]:
ts = data['IsCanceled']
ts.index = pd.to_datetime(
    (data['ArrivalDateYear'].astype(str)
     + '-'
     + data['ArrivalDateMonth']
     + '-'
     + data['ArrivalDateDayOfMonth'].astype(str))
)

ts.head(2)
ts.tail(2)
Out[4]:
2015-07-01    0
2015-07-01    0
Name: IsCanceled, dtype: int64
Out[4]:
2017-08-31    0
2017-08-31    0
Name: IsCanceled, dtype: int64
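
Because the month arrives as a name ("July"), the parse above leans on pandas' format inference. A minimal alternative sketch with the format spelled out explicitly (using the strptime %B directive for full month names):

date_str = (data['ArrivalDateYear'].astype(str)
            + '-'
            + data['ArrivalDateMonth']
            + '-'
            + data['ArrivalDateDayOfMonth'].astype(str))
# Same index as above, but with no format guessing involved
ts.index = pd.to_datetime(date_str, format='%Y-%B-%d')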
In [5]:
ts = ts.resample('W-MON').sum()

ts.head(2)
ts.tail(2)
Out[5]:
2015-07-06    54
2015-07-13    56
Freq: W-MON, Name: IsCanceled, dtype: int64
Out[5]:
2017-08-28    153
2017-09-04     42
Freq: W-MON, Name: IsCanceled, dtype: int64
In [6]:
fig = plt.figure(constrained_layout=True, figsize=(14, 7))
gs = fig.add_gridspec(2, 2)

ax_top = fig.add_subplot(gs[0, :])
ax_bot_0 = fig.add_subplot(gs[1, 0])
ax_bot_1 = fig.add_subplot(gs[1, 1])

_ = ts.plot(ax=ax_top, title='Number of cancellations per week', label='observed')
_ = ts.rolling(window=4).mean().plot(ax=ax_top, label='4-period SMA')
_ = plot_acf(ts, ax=ax_bot_0, title='ACF')
_ = plot_pacf(ts, ax=ax_bot_1, title='PACF')

_ = ax_top.legend()

seasonal_decompose docs

In [7]:
decomposition = seasonal_decompose(ts, period=52)

fig, axs = plt.subplots(ncols=2, sharex=True, figsize=(14, 4))

_ = decomposition.trend.plot(ax=axs[0], title='Trend')
_ = decomposition.seasonal.plot(ax=axs[1], title='Seasonality')
# decomposition.resid

fig.tight_layout()

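The residual component (left commented out above) can be inspected the same way; a quick sketch reusing the decomposition object fitted in the previous cell:

# Whatever trend and seasonality do not explain; large spikes flag
# weeks the additive decomposition handles poorly.
ax = decomposition.resid.plot(title='Residual', figsize=(14, 2))
sns.despine()
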
90/10 train-test split

In [8]:
len(ts)
round(len(ts) * .9), len(ts) - round(len(ts) * .9) 
Out[8]:
114
Out[8]:
(103, 11)
In [9]:
train, test = ts.iloc[:103], ts.iloc[103:]

len(train), len(test)
Out[9]:
(103, 11)

Modelling

In [10]:
%%time

# Grid search: keep the (p, d, q) order with the lowest BIC
p = d = q = range(0, 3)
best_bic = np.inf
params = tuple()

pdq = itertools.product(p, d, q)
next(pdq)  # Skip (0, 0, 0)

for var in pdq:
    mod = ARIMA(train, order=var)
    res = mod.fit()

    if res.bic <= best_bic:
        best_bic = res.bic
        params = var

params
CPU times: user 9.87 s, sys: 131 ms, total: 10 s
Wall time: 3.61 s
Out[10]:
(0, 1, 1)
In [11]:
mod = ARIMA(train, order=(0, 1, 1))
res = mod.fit()

res.summary()
Out[11]:
SARIMAX Results
Dep. Variable: IsCanceled No. Observations: 103
Model: ARIMA(0, 1, 1) Log Likelihood -522.400
Date: Sun, 21 Feb 2021 AIC 1048.801
Time: 19:03:11 BIC 1054.051
Sample: 07-06-2015 HQIC 1050.927
- 06-19-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ma.L1 -0.7323 0.067 -10.915 0.000 -0.864 -0.601
sigma2 1632.2393 225.046 7.253 0.000 1191.157 2073.322
Ljung-Box (L1) (Q): 0.73 Jarque-Bera (JB): 1.34
Prob(Q): 0.39 Prob(JB): 0.51
Heteroskedasticity (H): 0.50 Skew: 0.28
Prob(H) (two-sided): 0.05 Kurtosis: 3.04


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [12]:
forecast = res.get_forecast(len(test))
yhat = forecast.predicted_mean
yhat_conf_int = forecast.conf_int(alpha=0.05)

ci_lo = yhat_conf_int.iloc[:, 0]
ci_up = yhat_conf_int.iloc[:, 1]

ax = ts.plot(label='y', title='Number of cancellations per week', figsize=(12, 6))
_ = yhat.plot(ax=ax, ls='--', label='forecast')
_ = ax.fill_between(ci_lo.index, ci_lo, ci_up, alpha=0.2, color='lightgrey', label='95% CI')
legend = ax.legend(loc='upper left')

sns.despine()
ax.get_figure().tight_layout()

Calculate the root mean squared error (RMSE) of the forecast against the actual number of cancellations in the test set, and compare it to the RMSE of a naive baseline that predicts the test-set mean for every week. Also report the mean absolute percentage error (MAPE).

In [13]:
def rmse(y_true, y_pred):
    """Root mean squared error."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))


def mape(y_true, y_pred):
    """Mean absolute percentage error."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
In [14]:
np.sqrt(mean_squared_error(test, yhat)) == rmse(test, yhat)
Out[14]:
True
In [15]:
rmse(test, yhat), rmse(test, test.mean())
mape(test, yhat)
Out[15]:
(38.18750253046791, 38.17770540620068)
Out[15]:
33.032347014314446

Other models

pmdarima

import pmdarima as pm

arima_model = pm.auto_arima(
    train,
    start_p=0, start_q=0,
    max_p=10, max_q=10,
    start_P=0, start_Q=0,
    max_P=10, max_Q=10,
    m=51,
    stepwise=True,
    seasonal=True,
    information_criterion='bic',
    trace=True,
    d=1, D=1,
    error_action='warn',
    suppress_warnings=True,
    random_state=20,
    n_fits=30)

Best model: ARIMA(2,1,0)(0,1,0)[51]
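
With pmdarima installed, the fitted auto_arima model could also forecast the test horizon directly; a sketch (not run here) using its standard predict arguments:

# Point forecasts plus 95% prediction intervals straight from pmdarima
yhat_pm, ci_pm = arima_model.predict(
    n_periods=len(test), return_conf_int=True, alpha=0.05)

Below, the suggested orders are instead refitted with statsmodels, which keeps the comparison against the earlier ARIMA(0, 1, 1) within a single library.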

In [16]:
mod2 = ARIMA(train, order=(2, 1, 0))
res2 = mod2.fit()

res2.summary()
Out[16]:
SARIMAX Results
Dep. Variable: IsCanceled No. Observations: 103
Model: ARIMA(2, 1, 0) Log Likelihood -522.195
Date: Sun, 21 Feb 2021 AIC 1050.390
Time: 19:03:13 BIC 1058.265
Sample: 07-06-2015 HQIC 1053.579
- 06-19-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 -0.7481 0.087 -8.637 0.000 -0.918 -0.578
ar.L2 -0.3270 0.103 -3.165 0.002 -0.530 -0.125
sigma2 1628.2704 206.099 7.900 0.000 1224.323 2032.218
Ljung-Box (L1) (Q): 0.07 Jarque-Bera (JB): 2.81
Prob(Q): 0.79 Prob(JB): 0.25
Heteroskedasticity (H): 0.60 Skew: 0.33
Prob(H) (two-sided): 0.14 Kurtosis: 3.47


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [17]:
round(res.bic, 4), round(res2.bic, 4)
res.bic < res2.bic
Out[17]:
(1054.0508, 1058.2649)
Out[17]:
True

ARIMA docs

In [18]:
mod3 = ARIMA(train, order=(2, 1, 0), seasonal_order=(0, 1, 0, 51))
res3 = mod3.fit()

res3.summary()
Out[18]:
SARIMAX Results
Dep. Variable: IsCanceled No. Observations: 103
Model: ARIMA(2, 1, 0)x(0, 1, 0, 51) Log Likelihood -279.396
Date: Sun, 21 Feb 2021 AIC 564.793
Time: 19:03:14 BIC 570.588
Sample: 07-06-2015 HQIC 567.007
- 06-19-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 -0.8131 0.163 -4.974 0.000 -1.134 -0.493
ar.L2 -0.3285 0.154 -2.131 0.033 -0.631 -0.026
sigma2 3312.0904 597.750 5.541 0.000 2140.522 4483.659
Ljung-Box (L1) (Q): 0.02 Jarque-Bera (JB): 0.80
Prob(Q): 0.88 Prob(JB): 0.67
Heteroskedasticity (H): 0.89 Skew: -0.19
Prob(H) (two-sided): 0.82 Kurtosis: 3.47


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [19]:
round(res.bic, 4), round(res3.bic, 4)
res.bic < res3.bic
Out[19]:
(1054.0508, 570.5881)
Out[19]:
False
In [20]:
forecast = res3.get_forecast(len(test))
yhat = forecast.predicted_mean
yhat_conf_int = forecast.conf_int(alpha=0.05)

ci_lo = yhat_conf_int.iloc[:, 0]
ci_up = yhat_conf_int.iloc[:, 1]

ax = ts.plot(label='y', title='Number of cancellations per week', figsize=(12, 6))
_ = yhat.plot(ax=ax, ls='--', label='forecast')
_ = ax.fill_between(ci_lo.index, ci_lo, ci_up, alpha=0.2, color='lightgrey', label='95% CI')
legend = ax.legend(loc='upper left')

sns.despine()
ax.get_figure().tight_layout()
In [21]:
(rmse(test, res.get_forecast(len(test)).predicted_mean),
 rmse(test, test.mean()),
 rmse(test, res3.get_forecast(len(test)).predicted_mean))
Out[21]:
(38.18750253046791, 38.17770540620068, 63.342662387571124)
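
For a like-for-like comparison with the earlier MAPE figure, the same metric can be computed for both fitted models; a short sketch reusing the mape helper and results objects from above:

# MAPE: ARIMA(0, 1, 1) vs seasonal ARIMA(2, 1, 0)x(0, 1, 0, 51)
(mape(test, res.get_forecast(len(test)).predicted_mean),
 mape(test, res3.get_forecast(len(test)).predicted_mean))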