Tutorial PyCaret¶

  • link
  • link
In [1]:
import time
import glob
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import plotly
import plotly.graph_objects as go

from pycaret.datasets import get_data
#from pycaret.internal.pycaret_experiment import TimeSeriesExperiment
from pycaret.time_series import TSForecastingExperiment

plotly.offline.init_notebook_mode()
In [2]:
data_path = os.path.join(os.path.expanduser('~'),'ILAB_DATA','ISTAT', 'DATA')

Caricamento dei dati¶

In [3]:
# 25min
comuni_all_pop = pd.read_parquet(os.path.join(data_path,'comuni_all_pop.parquet'))
comuni_all = gpd.read_parquet(os.path.join(data_path,'comuni_all.parquet'))
comuni_all_pop['TIME_PERIOD'] = comuni_all_pop['TIME_PERIOD'].astype(int)
comuni_all_pop = comuni_all_pop.merge(comuni_all[['pro_com_t','anno','comune']], left_on=['REF_AREA','TIME_PERIOD'], right_on=['pro_com_t','anno'])
In [42]:
# Ciampino 058118
# Roma 058091
gdf = comuni_all_pop.loc[comuni_all_pop['REF_AREA']=='058118',['TIME_PERIOD','total']].copy()

Conversione in datetime e ordinamento¶

In [43]:
gdf['data'] = pd.to_datetime(gdf['TIME_PERIOD'], format='%Y')
gdf = gdf.sort_values(by='data', ascending=True)

Visualizzazione dati¶

In [44]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=gdf['data'], y=gdf['total']))#, name='descCarburante'))
fig.show()

Impostazione dell'indice e riempimento gap con valori precendenti¶

In [45]:
y = gdf[['data','total']].set_index('data')
y.index = y.index.to_period("Y")

Auto Create¶

In [51]:
fh = 3
fold = 3
In [52]:
exp = TSForecastingExperiment()
exp.setup(data=y, fh=fh, fold=fold, session_id=42)
  Description Value
0 session_id 42
1 Target total
2 Approach Univariate
3 Exogenous Variables Not Present
4 Original data shape (23, 1)
5 Transformed data shape (23, 1)
6 Transformed train set shape (20, 1)
7 Transformed test set shape (3, 1)
8 Rows with missing values 0.0%
9 Fold Generator ExpandingWindowSplitter
10 Fold Number 3
11 Enforce Prediction Interval False
12 Splits used for hyperparameters all
13 User Defined Seasonal Period(s) None
14 Ignore Seasonality Test False
15 Seasonality Detection Algo auto
16 Max Period to Consider 60
17 Seasonal Period(s) Tested []
18 Significant Seasonal Period(s) [1]
19 Significant Seasonal Period(s) without Harmonics [1]
20 Remove Harmonics False
21 Harmonics Order Method harmonic_max
22 Num Seasonalities to Use 1
23 All Seasonalities to Use [1]
24 Primary Seasonality 1
25 Seasonality Present False
26 Seasonality Type None
27 Target Strictly Positive True
28 Target White Noise No
29 Recommended d 1
30 Recommended Seasonal D 0
31 Preprocess False
32 CPU Jobs -1
33 Use GPU False
34 Log Experiment False
35 Experiment Name ts-default-name
36 USI b5f8
Out[52]:
<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x7ff393afcfd0>

Compare Models¶

In [53]:
# 26min
best_baseline_models = exp.compare_models(fold=fold, sort='smape', n_select=3)
best_baseline_models
  Model MASE RMSSE MAE RMSE MAPE SMAPE R2 TT (Sec)
theta Theta Forecaster 1.1873 1.0991 223.1660 251.7246 0.0059 0.0059 -7.6583 0.0400
naive Naive Forecaster 1.2934 1.2285 243.1111 277.3586 0.0064 0.0064 -18.6039 0.0267
auto_arima Auto ARIMA 1.5301 1.3719 289.3607 315.9818 0.0076 0.0076 -13.2939 0.2567
et_cds_dt Extra Trees w/ Cond. Deseasonalize & Detrending 1.6978 1.6364 322.1227 376.8875 0.0085 0.0085 -25.1448 0.1800
exp_smooth Exponential Smoothing 1.7274 1.5685 327.2908 361.4960 0.0086 0.0086 -20.2304 0.0267
arima ARIMA 1.7389 1.5907 326.4621 358.3690 0.0086 0.0086 -39.7350 0.0633
omp_cds_dt Orthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending 1.7720 1.6289 333.7403 373.6666 0.0088 0.0088 -18.6676 0.0567
en_cds_dt Elastic Net w/ Cond. Deseasonalize & Detrending 1.7720 1.6290 333.7481 373.6736 0.0088 0.0088 -18.6679 0.0600
lr_cds_dt Linear w/ Cond. Deseasonalize & Detrending 1.7720 1.6289 333.7403 373.6666 0.0088 0.0088 -18.6676 0.0733
ridge_cds_dt Ridge w/ Cond. Deseasonalize & Detrending 1.7720 1.6289 333.7407 373.6669 0.0088 0.0088 -18.6676 0.0633
lasso_cds_dt Lasso w/ Cond. Deseasonalize & Detrending 1.7720 1.6290 333.7499 373.6754 0.0088 0.0088 -18.6681 0.0667
llar_cds_dt Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending 1.7720 1.6290 333.7499 373.6754 0.0088 0.0088 -18.6681 0.0733
huber_cds_dt Huber w/ Cond. Deseasonalize & Detrending 1.8360 1.6735 344.6931 383.1269 0.0091 0.0091 -18.3441 0.0667
ada_cds_dt AdaBoost w/ Cond. Deseasonalize & Detrending 1.8885 1.7399 358.9872 400.7277 0.0095 0.0094 -31.1802 0.1167
dt_cds_dt Decision Tree w/ Cond. Deseasonalize & Detrending 1.9023 1.7722 361.6294 408.2775 0.0095 0.0095 -31.9375 0.0700
br_cds_dt Bayesian Ridge w/ Cond. Deseasonalize & Detrending 1.9318 1.7452 362.7290 399.5703 0.0096 0.0095 -20.1384 0.0767
gbr_cds_dt Gradient Boosting w/ Cond. Deseasonalize & Detrending 1.9011 1.7708 361.3701 407.9285 0.0095 0.0095 -31.8227 0.0833
rf_cds_dt Random Forest w/ Cond. Deseasonalize & Detrending 1.9694 1.8483 370.3568 423.4216 0.0098 0.0097 -23.7242 0.2333
knn_cds_dt K Neighbors w/ Cond. Deseasonalize & Detrending 2.4358 2.1823 450.8498 495.0524 0.0119 0.0118 -21.3223 0.1000
lightgbm_cds_dt Light Gradient Boosting w/ Cond. Deseasonalize & Detrending 2.6184 2.2990 483.3795 520.1676 0.0128 0.0127 -22.4420 2.0467
polytrend Polynomial Trend Forecaster 2.6548 2.3122 490.0676 523.1799 0.0129 0.0128 -21.9418 0.0367
ets ETS 2.6558 2.3129 490.2546 523.3472 0.0129 0.0128 -21.9581 0.0500
croston Croston 5.3384 4.4858 1007.3380 1020.6685 0.0265 0.0269 -286.6918 0.0400
grand_means Grand Means Forecaster 5.4653 4.5778 1028.1514 1040.1844 0.0271 0.0275 -264.8290 0.0233
Out[53]:
[ThetaForecaster(),
 NaiveForecaster(),
 AutoARIMA(random_state=42, suppress_warnings=True)]
In [54]:
compare_metrics = exp.pull()
compare_metrics
Out[54]:
Model MASE RMSSE MAE RMSE MAPE SMAPE R2 TT (Sec)
theta Theta Forecaster 1.1873 1.0991 223.166 251.7246 0.0059 0.0059 -7.6583 0.0400
naive Naive Forecaster 1.2934 1.2285 243.1111 277.3586 0.0064 0.0064 -18.6039 0.0267
auto_arima Auto ARIMA 1.5301 1.3719 289.3607 315.9818 0.0076 0.0076 -13.2939 0.2567
et_cds_dt Extra Trees w/ Cond. Deseasonalize & Detrending 1.6978 1.6364 322.1227 376.8875 0.0085 0.0085 -25.1448 0.1800
exp_smooth Exponential Smoothing 1.7274 1.5685 327.2908 361.496 0.0086 0.0086 -20.2304 0.0267
arima ARIMA 1.7389 1.5907 326.4621 358.369 0.0086 0.0086 -39.735 0.0633
omp_cds_dt Orthogonal Matching Pursuit w/ Cond. Deseasona... 1.772 1.6289 333.7403 373.6666 0.0088 0.0088 -18.6676 0.0567
en_cds_dt Elastic Net w/ Cond. Deseasonalize & Detrending 1.772 1.629 333.7481 373.6736 0.0088 0.0088 -18.6679 0.0600
lr_cds_dt Linear w/ Cond. Deseasonalize & Detrending 1.772 1.6289 333.7403 373.6666 0.0088 0.0088 -18.6676 0.0733
ridge_cds_dt Ridge w/ Cond. Deseasonalize & Detrending 1.772 1.6289 333.7407 373.6669 0.0088 0.0088 -18.6676 0.0633
lasso_cds_dt Lasso w/ Cond. Deseasonalize & Detrending 1.772 1.629 333.7499 373.6754 0.0088 0.0088 -18.6681 0.0667
llar_cds_dt Lasso Least Angular Regressor w/ Cond. Deseaso... 1.772 1.629 333.7499 373.6754 0.0088 0.0088 -18.6681 0.0733
huber_cds_dt Huber w/ Cond. Deseasonalize & Detrending 1.836 1.6735 344.6931 383.1269 0.0091 0.0091 -18.3441 0.0667
ada_cds_dt AdaBoost w/ Cond. Deseasonalize & Detrending 1.8885 1.7399 358.9872 400.7277 0.0095 0.0094 -31.1802 0.1167
dt_cds_dt Decision Tree w/ Cond. Deseasonalize & Detrending 1.9023 1.7722 361.6294 408.2775 0.0095 0.0095 -31.9375 0.0700
br_cds_dt Bayesian Ridge w/ Cond. Deseasonalize & Detren... 1.9318 1.7452 362.729 399.5703 0.0096 0.0095 -20.1384 0.0767
gbr_cds_dt Gradient Boosting w/ Cond. Deseasonalize & Det... 1.9011 1.7708 361.3701 407.9285 0.0095 0.0095 -31.8227 0.0833
rf_cds_dt Random Forest w/ Cond. Deseasonalize & Detrending 1.9694 1.8483 370.3568 423.4216 0.0098 0.0097 -23.7242 0.2333
knn_cds_dt K Neighbors w/ Cond. Deseasonalize & Detrending 2.4358 2.1823 450.8498 495.0524 0.0119 0.0118 -21.3223 0.1000
lightgbm_cds_dt Light Gradient Boosting w/ Cond. Deseasonalize... 2.6184 2.299 483.3795 520.1676 0.0128 0.0127 -22.442 2.0467
polytrend Polynomial Trend Forecaster 2.6548 2.3122 490.0676 523.1799 0.0129 0.0128 -21.9418 0.0367
ets ETS 2.6558 2.3129 490.2546 523.3472 0.0129 0.0128 -21.9581 0.0500
croston Croston 5.3384 4.4858 1007.338 1020.6685 0.0265 0.0269 -286.6918 0.0400
grand_means Grand Means Forecaster 5.4653 4.5778 1028.1514 1040.1844 0.0271 0.0275 -264.829 0.0233

Tune Best Models¶

In [55]:
best_tuned_models = [exp.tune_model(model) for model in best_baseline_models]
best_tuned_models
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2011 0.7649 0.7851 149.0836 180.1416 0.0039 0.0039 -11.6861
1 2014 1.9450 1.7803 370.7380 415.6881 0.0098 0.0097 -11.7454
2 2017 0.8519 0.7318 149.6764 159.3441 0.0039 0.0039 0.4566
Mean NaT 1.1873 1.0991 223.1660 251.7246 0.0059 0.0059 -7.6583
SD NaT 0.5369 0.4822 104.3494 116.2502 0.0028 0.0027 5.7381
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2011 0.8791 0.8082 171.3333 185.4476 0.0045 0.0045 -12.4444
1 2014 2.0554 1.9520 391.7949 455.7935 0.0104 0.0103 -14.3234
2 2017 0.7134 0.6198 125.3333 134.9511 0.0033 0.0033 0.6102
Mean NaT 1.2160 1.1267 229.4872 258.7308 0.0061 0.0060 -8.7192
SD NaT 0.5974 0.5887 116.2951 140.8611 0.0031 0.0031 6.6413
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2011 0.8791 0.8082 171.3333 185.4476 0.0045 0.0045 -12.4444
1 2014 2.9977 2.6876 571.4154 627.5467 0.0151 0.0150 -28.0476
2 2017 0.7134 0.6198 125.3333 134.9511 0.0033 0.0033 0.6102
Mean NaT 1.5301 1.3719 289.3607 315.9818 0.0076 0.0076 -13.2939
SD NaT 1.0400 0.9335 200.3250 221.2721 0.0053 0.0052 11.7149
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished
Out[55]:
[ThetaForecaster(),
 NaiveForecaster(strategy='drift'),
 AutoARIMA(random_state=42, suppress_warnings=True)]

Voting Blender¶

In [56]:
top_model_metrics = compare_metrics.iloc[0:3]['SMAPE']
display(top_model_metrics)

top_model_weights = 1 - top_model_metrics/top_model_metrics.sum()
display(top_model_weights)
theta         0.0059
naive         0.0064
auto_arima    0.0076
Name: SMAPE, dtype: object
theta         0.703518
naive         0.678392
auto_arima     0.61809
Name: SMAPE, dtype: object
In [57]:
voting_blender = exp.blend_models(best_tuned_models, method='mean', weights=top_model_weights.values.tolist())
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2011 0.8389 0.7712 163.5068 176.9504 0.0043 0.0043 -11.2406
1 2014 2.3078 2.1166 439.8988 494.2243 0.0116 0.0116 -17.0164
2 2017 0.7621 0.6586 133.8962 143.3981 0.0035 0.0035 0.5599
Mean NaT 1.3029 1.1821 245.7673 271.5243 0.0065 0.0065 -9.2324
SD NaT 0.7112 0.6624 137.8030 158.0673 0.0037 0.0036 7.3146
In [58]:
y_predict = exp.predict_model(voting_blender)
print(y_predict)
exp.plot_model(estimator=voting_blender)
  Model MASE RMSSE MAE RMSE MAPE SMAPE R2
0 EnsembleForecaster 1.0001 0.9278 175.4905 203.5851 0.0045 0.0046 -4.5873
          y_pred
2021  38366.8370
2022  38462.5095
2023  38558.1821

Finalize Model¶

In [59]:
final_model = exp.finalize_model(voting_blender)
print(exp.predict_model(final_model))
exp.plot_model(final_model, data_kwargs={'fh':10})
          y_pred
2024  38840.6328
2025  38939.9321
2026  39039.2314
In [ ]:
 
In [ ]:
 
In [ ]:
 

Save Model¶

In [17]:
_ = exp.save_model(final_model, "my_blender")
Transformation Pipeline and Model Successfully Saved

Load Model¶

In [ ]:
loaded_exp = TSForecastingExperiment()
m = loaded_exp.load_model("my_blender")
# Predictions should be same as before the model was saved and loaded
loaded_exp.predict_model(m)
In [ ]: