import pandas as pd
import numpy as np
import pymc3 as pm

import arviz as az

import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from sklearn.metrics import mean_squared_error

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)
idx = pd.IndexSlice


ts = pd.read_csv('data/h1weekly.csv', parse_dates=['Date'], index_col='Date', squeeze=True)

type(ts)
ts.head(2)
ts.tail(2)

_ = ts.plot(figsize=(12, 4), title='Number of cancellations per week')


ts.min(), ts.max()

(14, 222)


ts.describe().round(2)

count    115.00
mean      96.71
std       45.87
min       14.00
25%       64.00
50%       95.00
75%      127.50
max      222.00
Name: IsCanceled, dtype: float64


# Define priors (one's beliefs about mu and sigma) and generate 1,000 samples
mu_prior = 120
sigma_prior = 10

with pm.Model() as model:
    mu = pm.Normal('mu', mu=mu_prior, sigma=sigma_prior)
    sd = pm.HalfNormal('sd', sigma=sigma_prior)
    obs = pm.Normal('obs', mu=mu, sigma=sd, observed=ts)
    
    idata = pm.sample(1_000, return_inferencedata=True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sd, mu]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 26 seconds.


az.plot_trace(idata);


az.summary(idata)


az.plot_forest(idata, r_hat=True);


az.plot_posterior(idata);


with model:
    post_pred = pm.sample_posterior_predictive(idata.posterior)

# Add posterior predictive to the InferenceData
az.concat(idata, az.from_pymc3(posterior_predictive=post_pred), inplace=True)

/Users/dimitri/opt/miniconda3/envs/ts-forecasting/lib/python3.8/site-packages/arviz/data/io_pymc3.py:88: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(


fig, ax = plt.subplots(figsize=(10, 6))
az.plot_ppc(idata, ax=ax)
ax.axvline(ts.mean(), ls='--', color='r', label='True mean')
ax.legend(fontsize=10);


with pm.Model() as ar1:
    # Effect of lagged value on current value
    theta = pm.Uniform('theta', lower=-1, upper=1)
    
    # Precision of the innovation term
    tau = pm.Gamma('tau', mu=1, sd=1)
    
    obs = pm.AR1('observed', k=theta, tau_e=tau, observed=ts)
    
    idata = pm.sample(2_000, return_inferencedata=True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [tau, theta]

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 57 seconds.


az.plot_posterior(idata);


az.summary(idata)


window_size = 4  # 4 weeks ~ 1 month
smoothed_ts = ts.rolling(window_size).mean()

smoothed_ts.head(2)
smoothed_ts.tail(2)

ax = ts.plot(figsize=(12, 4), title='Number of cancellations per week', label='')
ax = smoothed_ts.plot(ax=ax, label='4-period SMA')

_ = ax.legend()

Date
2015-06-21   NaN
2015-06-28   NaN
Name: IsCanceled, dtype: float64

Date
2017-08-20    159.50
2017-08-27    141.25
Name: IsCanceled, dtype: float64


with pm.Model() as ar1_smoothed:
    # Effect of lagged value on current value
    theta = pm.Uniform('theta', lower=-1, upper=1)
    
    # Precision of the innovation term
    tau = pm.Gamma('tau', mu=1, sd=1)
    
    obs = pm.AR1('observed', k=theta, tau_e=tau, observed=smoothed_ts.dropna())
    
    idata_smoothed = pm.sample(2_000, return_inferencedata=True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [tau, theta]

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 38 seconds.
There were 6 divergences after tuning. Increase `target_accept` or reparameterize.
There were 6 divergences after tuning. Increase `target_accept` or reparameterize.
There were 6 divergences after tuning. Increase `target_accept` or reparameterize.
There were 13 divergences after tuning. Increase `target_accept` or reparameterize.


_ = az.plot_posterior(idata_smoothed)
az.summary(idata_smoothed)


vol = ts.pct_change().dropna()

vol.describe().round(1)

_ = vol.plot(figsize=(12, 4), title='Time Series % change')

count    114.0
mean       0.2
std        1.0
min       -0.9
25%       -0.3
50%        0.0
75%        0.4
max        6.1
Name: IsCanceled, dtype: float64


with pm.Model() as vol_model:
    # The model remembers the datetime index with the name 'date'
    change_cancellations = pm.Data('cancellations', vol, dims='date', export_index_as_coords=True)

    nu = pm.Exponential('nu', 1 / 10.0, testval=5.0)
    sigma = pm.Exponential('sigma', 2.0, testval=0.1)

    # We can now figure out the shape of variables based on the index of the dataset
    s = pm.GaussianRandomWalk('s', sigma=sigma, dims='date')
    volatility_process = pm.Deterministic(
        'volatility_process', pm.math.exp(-2 * s) ** 0.5, dims='date'
    )

    r = pm.StudentT('r', nu=nu, sigma=volatility_process, observed=change_cancellations, dims='date')

vol_model.RV_dims

{'cancellations': ('date',),
 's': ('date',),
 'volatility_process': ('date',),
 'r': ('date',)}


with vol_model:
    idata = pm.sample(1_000, init='adapt_diag', return_inferencedata=False)

Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [s, sigma, nu]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 133 seconds.

0, dim: date, 114 =? 114

The acceptance probability does not match the target. It is 0.9111600424611335, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9226335183755736, but should be close to 0.8. Try to increase the number of tuning steps.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.


az.plot_trace(idata, var_names=('nu', 'sigma'));

/Users/dimitri/opt/miniconda3/envs/ts-forecasting/lib/python3.8/site-packages/arviz/data/io_pymc3.py:88: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(

0, dim: date, 114 =? 114
0, dim: date, 114 =? 114


np.exp(idata['s', ::5].T).shape
np.exp(idata['s', ::5].T)[:, 0].shape

(114, 800)

(114,)


fig, ax = plt.subplots(figsize=(12, 6))

_ = ax.plot(vol.index, vol.values)
_ = ax.plot(vol.index, 1 / np.exp(idata['s', ::5].T), 'C1', alpha=0.03)
_ = ax.set(title='Volatility Process', xlabel='Date', ylabel='% change in cancellations')
_ = ax.legend(['% change in cancellations', 'Stochastic Volatility Process'], loc='upper right')


with pm.Model() as vol_model_2:
    # The model remembers the datetime index with the name 'date'
    cancellations = pm.Data('cancellations', ts, dims='date', export_index_as_coords=True)

    nu = pm.Exponential('nu', 1 / 10.0, testval=5.0)
    sigma = pm.Exponential('sigma', 2.0, testval=0.1)

    # We can now figure out the shape of variables based on the index of the dataset
    s = pm.GaussianRandomWalk('s', sigma=sigma, dims='date')
    volatility_process = pm.Deterministic(
        'volatility_process', pm.math.exp(-2 * s) ** 0.5, dims='date'
    )

    r = pm.StudentT('r', nu=nu, sigma=volatility_process, observed=cancellations, dims='date')
    
    idata = pm.sample(1_000, init='adapt_diag', return_inferencedata=False)

Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [s, sigma, nu]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 156 seconds.

0, dim: date, 115 =? 115

The acceptance probability does not match the target. It is 0.6395079607417525, but should be close to 0.8. Try to increase the number of tuning steps.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.


az.plot_trace(idata, var_names=('nu', 'sigma'));

/Users/dimitri/opt/miniconda3/envs/ts-forecasting/lib/python3.8/site-packages/arviz/data/io_pymc3.py:88: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(

0, dim: date, 115 =? 115
0, dim: date, 115 =? 115


fig, ax = plt.subplots(figsize=(12, 6))

_ = ax.plot(ts.index, ts.values)
_ = ax.plot(ts.index, 1 / np.exp(idata['s', ::5].T), 'C1', alpha=0.03)
_ = ax.set(title='Volatility Process', xlabel='Date', ylabel='Cancellations')
_ = ax.legend(['Cancellations', 'Stochastic Volatility Process'], loc='upper right')

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_mean	ess_sd	ess_bulk	ess_tail	r_hat
mu	99.938	3.883	93.146	107.597	0.066	0.047	3486.0	3459.0	3500.0	2570.0	1.0
sd	43.028	2.603	38.215	47.823	0.045	0.032	3365.0	3332.0	3402.0	2453.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_mean	ess_sd	ess_bulk	ess_tail	r_hat
theta	0.878	0.041	0.804	0.958	0.001	0.0	4930.0	4797.0	4770.0	2860.0	1.0
tau	0.000	0.000	0.000	0.001	0.000	0.0	4762.0	4700.0	4776.0	4347.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_mean	ess_sd	ess_bulk	ess_tail	r_hat
theta	0.986	0.008	0.972	0.999	0.0	0.0	3952.0	3938.0	3271.0	2423.0	1.0
tau	0.005	0.001	0.004	0.006	0.0	0.0	4039.0	4039.0	3960.0	3238.0	1.0

数学建模

编程知识点

模型第三方库-Sympy

Bayesian Statistical Methods with PyMC3¶

In this notebook¶

Milestone 1: Generating Posterior Distributions with PyMC3¶

Sampling¶

Analyze sampling results¶

Posterior Predictive Sampling¶

Milestone 2: AR(1) Time Series Analysis¶

AR(1) model for `ts`¶

AR(1) model for smoothed `ts`¶

Milestone 3: Forecasting Volatility with Bayesian Time Series Methods¶

Volatility Model 1¶

Volatility Model 2¶

数学建模

编程知识点

模型第三方库-Sympy

Bayesian Statistical Methods with PyMC3¶

In this notebook¶

Milestone 1: Generating Posterior Distributions with PyMC3¶

Sampling¶

Analyze sampling results¶

Posterior Predictive Sampling¶

Milestone 2: AR(1) Time Series Analysis¶

AR(1) model for ts¶

AR(1) model for smoothed ts¶

Milestone 3: Forecasting Volatility with Bayesian Time Series Methods¶

Volatility Model 1¶

Volatility Model 2¶

AR(1) model for `ts`¶

AR(1) model for smoothed `ts`¶