Feng Li
School of Statistics and Mathematics
Central University of Finance and Economics
This notebook provides an example of how to train an ARIMA model to generate point forecasts of product sales in retail. We will train an ARIMA-based model on the Orange Juice dataset.
An ARIMA (AutoRegressive Integrated Moving Average) model can be created using the ARIMA(p,d,q) model within the statsmodels library. In this notebook, we will be using an alternative library, pmdarima, which allows us to automatically search for optimal ARIMA parameters within a specified range. More specifically, we will be using the auto_arima function within pmdarima to automatically discover the optimal parameters for an ARIMA model. This function wraps the ARIMA and SARIMAX models of the statsmodels library, which correspond to the non-seasonal and seasonal model spaces, respectively.
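For orientation, the sketch below shows roughly what fitting such a model directly in statsmodels looks like, on a synthetic placeholder series; the fixed order (1, 0, 0) is an arbitrary illustration here, whereas auto_arima searches for the order automatically.
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Synthetic placeholder series standing in for a weekly sales history
y = np.random.default_rng(0).normal(loc=9.0, scale=0.3, size=100)
# Non-seasonal ARIMA(1, 0, 0) with a constant term
result = SARIMAX(y, order=(1, 0, 0), trend="c").fit(disp=False)
print(result.summary())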
In an ARIMA model there are three parameters that are used to help model the major aspects of a time series: seasonality, trend, and noise. These parameters are:
- p: the number of autoregressive (AR) terms,
- d: the number of nonseasonal differences needed for stationarity,
- q: the number of lagged forecast errors, i.e. moving average (MA) terms.
If our data has a seasonal component, we use a seasonal ARIMA model, or ARIMA(p,d,q)(P,D,Q)m. In that case, we have an additional set of parameters: P, D, and Q, which describe the autoregressive, differencing, and moving average terms for the seasonal part of the ARIMA model, and m refers to the number of periods in each season.
! pip3 install pmdarima --user
Looking in indexes: https://mirrors.163.com/pypi/simple/
Requirement already satisfied: pmdarima in /home/fli/.local/lib/python3.9/site-packages (1.8.3)
Requirement already satisfied: pandas>=0.19 in /home/fli/.local/lib/python3.9/site-packages (from pmdarima) (1.3.4)
Requirement already satisfied: scipy>=1.3.2 in /usr/lib/python3/dist-packages (from pmdarima) (1.7.1)
Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from pmdarima) (1.26.5)
Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from pmdarima) (0.17.0)
Requirement already satisfied: numpy>=1.19.3 in /usr/lib/python3/dist-packages (from pmdarima) (1.19.5)
Requirement already satisfied: Cython!=0.29.18,>=0.29 in /usr/lib/python3/dist-packages (from pmdarima) (0.29.24)
Requirement already satisfied: statsmodels!=0.12.0,>=0.11 in /usr/lib/python3/dist-packages (from pmdarima) (0.12.2)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/lib/python3/dist-packages (from pmdarima) (58.2.0)
Requirement already satisfied: scikit-learn>=0.22 in /usr/lib/python3/dist-packages (from pmdarima) (0.23.2)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas>=0.19->pmdarima) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas>=0.19->pmdarima) (2021.3)
import os
import sys
import math
import warnings
import itertools
import numpy as np
import pandas as pd
# import scrapbook as sb
import matplotlib.pyplot as plt
from pmdarima.arima import auto_arima
pd.options.display.float_format = "{:,.2f}".format
np.set_printoptions(precision=2)
warnings.filterwarnings("ignore")
print("System version: {}".format(sys.version))
System version: 3.9.7 (default, Sep 24 2021, 09:43:00) [GCC 10.3.0]
Next, we define global settings related to the model. We will use historical weekly sales data only, without any covariate features, to train the ARIMA model. The model parameter ranges are provided in params. These are later used by the auto_arima() function to search the space for the optimal set of parameters. To increase the space of models to search over, increase the max_p and max_q parameters.
NOTE: Our data does not show a strong seasonal component (as demonstrated in the data exploration example notebook), so we will not be searching over the seasonal ARIMA models. To search over the seasonal models, set seasonal to True and include the start_P, start_Q, max_P, and max_Q parameters in the auto_arima() function (a sketch of such settings follows the parameter cell below).
# Forecasting settings
N_SPLITS = 1
HORIZON = 2
GAP = 2
FIRST_WEEK = 40
LAST_WEEK = 138
# Parameters of ARIMA model
params = {
    "seasonal": False,
    "start_p": 0,
    "start_q": 0,
    "max_p": 5,
    "max_q": 5,
    "m": 52,
}
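For reference, enabling the seasonal search mentioned in the note above could look like the sketch below. It is not executed in this notebook, and the seasonal ranges (start_P, start_Q, max_P, max_Q) are illustrative choices rather than tuned values.
# Hypothetical settings for a seasonal ARIMA search (not used below)
seasonal_params = {
    "seasonal": True,
    "start_p": 0,
    "start_q": 0,
    "max_p": 5,
    "max_q": 5,
    "start_P": 0,
    "start_Q": 0,
    "max_P": 2,
    "max_Q": 2,
    "m": 52,  # number of periods per season (weekly data with a yearly cycle)
}
# These keys map directly onto auto_arima keyword arguments, e.g.
# auto_arima(train_ts, **seasonal_params, stepwise=True)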
We store the training data and test data using dataframes:
- train_df contains the historical sales up to week 135 (the time we make forecasts);
- aux_df contains price/promotion information up until week 138; here we assume that future price and promotion information up to a certain number of weeks ahead is predetermined and known. In our example, we will be using historical sales only and will not use the aux_df data;
- test_df contains the sales of each product in weeks 137 and 138.
Assuming the current week is week 135, our goal is to forecast the sales in weeks 137 and 138 using the training data. There is a one-week gap between the current week and the first target week of forecasting, as we want to leave time for planning inventory in practice.
train_df = pd.read_csv("data/OrangeJuice_train.csv")
test_df = pd.read_csv("data/OrangeJuice_test.csv")
Our time series data is not complete, since sales are missing for some stores/products and weeks. We will fill in those missing values by propagating the last valid observation forward to the next available time step. We will define functions for data frame processing and then apply them in each forecasting round.
Note that our time series are grouped by store and brand, while week represents a time step, and logmove represents the value to predict.
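As a small illustration of this fill strategy on a toy series (not part of the dataset): forward fill propagates the last observed value, and the subsequent backward fill handles missing values at the start of a series. The same logic is applied per store/brand group in the helper function defined below.
# Toy example of the forward-then-backward fill used below
s = pd.Series([None, 9.0, None, 8.5, None])
print(s.fillna(method="ffill").fillna(method="bfill").tolist())
# [9.0, 9.0, 9.0, 8.5, 8.5]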
Let's first process the training data. Note that the training data runs from FIRST_WEEK to LAST_WEEK - HORIZON - GAP + 1, as defined in the Parameters section above.
# Select only required columns
train_df = train_df[["store", "brand", "week", "logmove"]]
train_df
 | store | brand | week | logmove |
---|---|---|---|---|
0 | 2 | 1 | 40 | 9.02 |
1 | 2 | 1 | 46 | 8.72 |
2 | 2 | 1 | 47 | 8.25 |
3 | 2 | 1 | 48 | 8.99 |
4 | 2 | 1 | 50 | 9.09 |
... | ... | ... | ... | ... |
84178 | 137 | 11 | 131 | 9.63 |
84179 | 137 | 11 | 132 | 9.70 |
84180 | 137 | 11 | 133 | 9.00 |
84181 | 137 | 11 | 134 | 8.91 |
84182 | 137 | 11 | 135 | 9.90 |
84183 rows × 4 columns
Notice that the unit sales of the products are given on a logarithmic scale. We will use this quantity for training the forecasting model, as it smooths out the time series and results in better forecasting performance. For evaluation, we will convert logmove back to unit sales, for consistency across our examples.
def df_from_cartesian_product(dict_in):
    """Generate a Pandas dataframe from Cartesian product of lists."""
    from itertools import product

    cart = list(product(*dict_in.values()))
    df = pd.DataFrame(cart, columns=dict_in.keys())
    return df
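For example, a small grid of hypothetical values expands to one row per store/brand/week combination:
# Illustrative input: 2 stores x 2 brands x 3 weeks = 12 rows, 3 columns
d = {"store": [2, 5], "brand": [1, 2], "week": [40, 41, 42]}
print(df_from_cartesian_product(d).shape)  # (12, 3)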
def complete_and_fill_df(df, stores, brands, weeks):
    """Completes missing rows in the Orange Juice dataset and fills in the missing values."""
    d = {"store": stores, "brand": brands, "week": weeks}
    data_grid = df_from_cartesian_product(d)
    # Complete all rows
    df_filled = pd.merge(data_grid, df, how="left", on=["store", "brand", "week"])
    # Fill in missing values within each store/brand group
    df_filled = df_filled.groupby(["store", "brand"]).apply(
        lambda x: x.fillna(method="ffill").fillna(method="bfill")
    )
    return df_filled
# Create a dataframe to hold all necessary data
store_list = train_df["store"].unique()
brand_list = train_df["brand"].unique()
train_week_list = range(FIRST_WEEK, LAST_WEEK - (HORIZON - 1) - (GAP - 1))
train_filled = complete_and_fill_df(train_df, stores=store_list, brands=brand_list, weeks=train_week_list)
train_filled
 | store | brand | week | logmove |
---|---|---|---|---|
0 | 2 | 1 | 40 | 9.02 |
1 | 2 | 1 | 41 | 9.02 |
2 | 2 | 1 | 42 | 9.02 |
3 | 2 | 1 | 43 | 9.02 |
4 | 2 | 1 | 44 | 9.02 |
... | ... | ... | ... | ... |
87643 | 137 | 11 | 131 | 9.63 |
87644 | 137 | 11 | 132 | 9.70 |
87645 | 137 | 11 | 133 | 9.00 |
87646 | 137 | 11 | 134 | 8.91 |
87647 | 137 | 11 | 135 | 9.90 |
87648 rows × 4 columns
Let's now process the test data. Note that the test data runs from LAST_WEEK - HORIZON + 1 to LAST_WEEK. In addition to filling in missing values, we also convert unit sales from the logarithmic scale back to counts. We do model training on the log scale, due to improved performance; however, we transform the test data back into the unit scale (counts) by applying math.exp(), so that we can evaluate performance on the unit scale.
# Convert logarithmic sales back to unit sales (actual counts) for evaluation
test_df["actuals"] = test_df.logmove.apply(lambda x: round(math.exp(x)))
test_df = test_df[["store", "brand", "week", "actuals"]]
test_week_list = range(LAST_WEEK - HORIZON + 1, LAST_WEEK + 1)
test_filled = complete_and_fill_df(test_df, stores=store_list, brands=brand_list, weeks=test_week_list)
test_filled
 | store | brand | week | actuals |
---|---|---|---|---|
0 | 2 | 1 | 137 | 9792 |
1 | 2 | 1 | 138 | 16960 |
2 | 2 | 2 | 137 | 6240 |
3 | 2 | 2 | 138 | 14784 |
4 | 2 | 3 | 137 | 1920 |
... | ... | ... | ... | ... |
1821 | 137 | 9 | 138 | 384 |
1822 | 137 | 10 | 137 | 40384 |
1823 | 137 | 10 | 138 | 7232 |
1824 | 137 | 11 | 137 | 7424 |
1825 | 137 | 11 | 138 | 6144 |
1826 rows × 4 columns
We next train an ARIMA model for a single time series, for demonstration. We select STORE = 2 and BRAND = 6 and filter our data based on these values.
STORE = 2
BRAND = 6
train_ts = train_filled.loc[(train_filled.store == STORE) & (train_filled.brand == BRAND)]
train_ts.tail(10)
 | store | brand | week | logmove |
---|---|---|---|---|
566 | 2 | 6 | 126 | 8.52 |
567 | 2 | 6 | 127 | 8.03 |
568 | 2 | 6 | 128 | 8.15 |
569 | 2 | 6 | 129 | 8.03 |
570 | 2 | 6 | 130 | 7.74 |
571 | 2 | 6 | 131 | 7.45 |
572 | 2 | 6 | 132 | 7.70 |
573 | 2 | 6 | 133 | 7.93 |
574 | 2 | 6 | 134 | 7.27 |
575 | 2 | 6 | 135 | 6.96 |
train_ts = np.array(train_ts.logmove)
model = auto_arima(
    train_ts,
    seasonal=params["seasonal"],
    start_p=params["start_p"],
    start_q=params["start_q"],
    max_p=params["max_p"],
    max_q=params["max_q"],
    stepwise=True,
)
model.fit(train_ts)
ARIMA(order=(1, 0, 0), scoring_args={}, suppress_warnings=True)
Let's look at the model summary. As seen from the summary, the selected ARIMA model is (p=1, d=0, q=0). This is a relatively simple model, also referred to as a first-order autoregressive model. It indicates that the time series is stationary and can be predicted as a multiple of its own previous value, plus a constant; that is, an ARIMA(1,0,0) + constant model.
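In standard textbook notation (not output produced by this notebook), an ARIMA(1,0,0) model with a constant can be written as y_t = c + phi_1 * y_(t-1) + e_t, where c is the constant (reported as the intercept in the summary below), phi_1 is the first-order autoregressive coefficient, and e_t is white noise.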
model.summary()
Dep. Variable: | y | No. Observations: | 96 |
---|---|---|---|
Model: | SARIMAX(1, 0, 0) | Log Likelihood | -18.335 |
Date: | Fri, 12 Nov 2021 | AIC | 42.670 |
Time: | 12:27:01 | BIC | 50.363 |
Sample: | 0 - 96 | HQIC | 45.779 |
Covariance Type: | opg |
 | coef | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
intercept | 3.5490 | 0.686 | 5.172 | 0.000 | 2.204 | 4.894 |
ar.L1 | 0.5579 | 0.086 | 6.468 | 0.000 | 0.389 | 0.727 |
sigma2 | 0.0855 | 0.012 | 6.973 | 0.000 | 0.061 | 0.109 |
Ljung-Box (L1) (Q): | 0.21 | Jarque-Bera (JB): | 1.39 |
---|---|---|---|
Prob(Q): | 0.64 | Prob(JB): | 0.50 |
Heteroskedasticity (H): | 1.48 | Skew: | -0.28 |
Prob(H) (two-sided): | 0.27 | Kurtosis: | 3.15 |
The model summary contains a lot of information. The coefficient table in the middle provides the estimates for the weights of the respective p and q terms. Notice that the coefficient of the AR1 term has a low p-value (the P>|z| column), indicating that this term is significant. It also shows that the constant term is significant with a low p-value.
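If you prefer to read these estimates programmatically rather than from the rendered summary, the sketch below should work. It assumes the fitted pmdarima model exposes the underlying statsmodels results object via the arima_res_ attribute (an assumption about pmdarima internals; check your installed version).
# Inspect estimated coefficients and their p-values (assumes pmdarima's arima_res_ attribute)
res = model.arima_res_
print(res.params)   # intercept, ar.L1 and sigma2 estimates, as in the table above
print(res.pvalues)  # small p-values indicate significant terms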
Next, let's also examine the diagnostics plot for the selected model.
model.plot_diagnostics(figsize=(10, 8))
plt.show()
In the top left, the residual errors fluctuate around a mean of zero and have a uniform variance, which may indicate that there is no bias in the predictions. The density plot in the top right suggests a normal distribution with mean zero.
The correlogram, or ACF plot, in the lower right shows that the residual errors are not autocorrelated. Any detected autocorrelation in this plot would suggest that there is some pattern in the residual errors that is not explained by the model, so adding additional predictors to the model might be beneficial.
In the normal Q-Q plot in the bottom left, we do not see a significant deviation of the residuals from the red line, which indicates that the model is a good fit.
Overall, based on the above, it seems that the model is a good fit for this data.
It is worth noting that selecting the best parameters for an ARIMA model can be challenging - somewhat subjective and time intensive - and should be done following a thorough data examination (seasonality, trend, bias). We use the auto_arima() function to search a provided space of parameters for the best model, mostly to demonstrate its usage and functionality.
Let's now take a look at the predictions. Since the auto_arima model makes consecutive forecasts from the last time point, we want to forecast the next n_periods = GAP + HORIZON - 1 points, so that we can account for the GAP, as described in the data setup. As mentioned above, we also transform our predictions from the logarithmic scale to counts before calculating the evaluation metric.
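Concretely, with GAP = 2 and HORIZON = 2, n_periods = 3: the model forecasts the next three weeks (136, 137, and 138), and we keep only the last HORIZON = 2 values, i.e. the forecasts for weeks 137 and 138.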
preds = model.predict(n_periods=GAP + HORIZON - 1)
predictions = np.round(np.exp(preds[-HORIZON:]))
pred_df = pd.DataFrame({"predictions": predictions, "store": STORE, "brand": BRAND, "week": test_week_list})
pred_df
 | predictions | store | brand | week |
---|---|---|---|---|
0 | 2,199.00 | 2 | 6 | 137 |
1 | 2,546.00 | 2 | 6 | 138 |
To evaluate the model, we will use the mean absolute percentage error (MAPE).
# Combine actual units and predictions
test_ts = test_filled.loc[(test_filled.store == STORE) & (test_filled.brand == BRAND)]
combined = pd.merge(pred_df, test_ts, on=["store", "brand", "week"], how="left")
combined
 | predictions | store | brand | week | actuals |
---|---|---|---|---|---|
0 | 2,199.00 | 2 | 6 | 137 | 5760 |
1 | 2,546.00 | 2 | 6 | 138 | 1440 |
def MAPE(predictions, actuals):
    """
    Implements Mean Absolute Percent Error (MAPE).

    Args:
        predictions (array like): a vector of predicted values.
        actuals (array like): a vector of actual values.

    Returns:
        numpy.float: MAPE value
    """
    if not (isinstance(actuals, pd.Series) and isinstance(predictions, pd.Series)):
        predictions, actuals = pd.Series(predictions), pd.Series(actuals)
    return ((predictions - actuals).abs() / actuals).mean()
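As a quick worked example using the store 2 / brand 6 case above: with predictions of 2,199 and 2,546 against actuals of 5,760 and 1,440, MAPE = (|2199 - 5760| / 5760 + |2546 - 1440| / 1440) / 2 ≈ 0.693, i.e. roughly 69.3%, which matches the value printed below.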
metric_value = MAPE(combined.predictions, combined.actuals) * 100
print(f"MAPE of the forecasts is {metric_value}%")
MAPE of the forecasts is 69.31423611111111%
Now let's run model training across all the stores and brands. We will re-run the same code to automatically search for the best parameters, simply wrapped in a for loop iterating over stores and brands.
def train_store_brand(data, store, brand):
    """Train an ARIMA model on a single store/brand series and return predictions merged with actuals."""
    train_ts = data.loc[(data.store == store) & (data.brand == brand)]
    train_ts = np.array(train_ts["logmove"])
    model = auto_arima(
        train_ts,
        seasonal=params["seasonal"],
        start_p=params["start_p"],
        start_q=params["start_q"],
        max_p=params["max_p"],
        max_q=params["max_q"],
        stepwise=True,
        error_action="ignore",
    )
    model.fit(train_ts)
    preds = model.predict(n_periods=GAP + HORIZON - 1)
    predictions = np.round(np.exp(preds[-HORIZON:]))
    pred_df = pd.DataFrame({"predictions": predictions, "store": store, "brand": brand, "week": test_week_list})
    test_ts = test_filled.loc[(test_filled.store == store) & (test_filled.brand == brand)]
    return pd.merge(pred_df, test_ts, on=["store", "brand", "week"], how="left")
%%time
from datetime import datetime
# Just train a few stores to save time
store_list = store_list[0:3]
result_df = pd.DataFrame(None, columns=["predictions", "store", "brand", "week", "actuals"])
print("Training ARIMA model...")
for store, brand in itertools.product(store_list, brand_list):
    if brand == 1:
        print(f"{datetime.now().time()} - Forecasting for store: {store}")
    combined_df = train_store_brand(train_filled, store, brand)
    result_df = result_df.append(combined_df, ignore_index=True)
Training ARIMA model...
12:29:25.881911 - Forecasting for store: 2
12:29:44.320624 - Forecasting for store: 5
12:29:55.796922 - Forecasting for store: 8
CPU times: user 2min 31s, sys: 6.6 s, total: 2min 38s
Wall time: 40.4 s
Let's compute MAPE for all predictions.
metric_value
69.31423611111111
metric_value = MAPE(result_df.predictions, result_df.actuals) * 100
# sb.glue("MAPE", metric_value)
print(f"MAPE of the forecasts is {metric_value} %")
MAPE of the forecasts is 68.78729124622176 %
When building a model with auto_arima for a large number of time series, it is often difficult to examine each model individually (in the way we did for the single time series above). Since auto_arima searches a restricted space of models, defined by the ranges of the p and q parameters, we may not find an optimal model for each time series.
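If compute time allows, one way to mitigate this is to widen the search, for example by increasing max_p and max_q or by turning off the stepwise heuristic so that auto_arima evaluates the full grid of candidate models. The sketch below illustrates such a call; the ranges are illustrative choices, not recommendations.
# Hypothetical, more exhaustive (and much slower) search over the non-seasonal model space
model_exhaustive = auto_arima(
    train_ts,
    seasonal=False,
    start_p=0, max_p=8,
    start_q=0, max_q=8,
    max_d=2,
    stepwise=False,          # evaluate the full grid of (p, d, q) combinations
    error_action="ignore",
    suppress_warnings=True,
)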
Let's plot a few examples of forecasted results.
num_samples = 6
min_week = 120
sales = pd.read_csv("data/yx.csv")
sales["move"] = sales.logmove.apply(lambda x: round(math.exp(x)) if x > 0 else 0)
result_df["move"] = result_df.predictions
from data.plot import plot_predictions_with_history
plot_predictions_with_history(
    result_df,
    sales,
    grain1_unique_vals=store_list,
    grain2_unique_vals=brand_list,
    time_col_name="week",
    target_col_name="move",
    grain1_name="store",
    grain2_name="brand",
    min_timestep=min_week,
    num_samples=num_samples,
    predict_at_timestep=max(train_df.week),
    line_at_predict_time=True,
    title="Prediction results for a few sample time series (predictions are made at week 135)",
    x_label="week",
    y_label="unit sales",
    random_seed=2,
)
Rob J. Hyndman and George Athanasopoulos. 2018. Forecasting: Principles and Practice. Chapter 8, ARIMA models: https://otexts.com/fpp2/arima.html
Rob J. Hyndman and George Athanasopoulos. Forecasting: Principles and Practice (2nd edition), Chinese translation by Yanfei Kang and Feng Li: https://otexts.com/fppcn/arima-cn.html