In [1]:
import time
import glob
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import plotly
import plotly.graph_objects as go
from pycaret.datasets import get_data
#from pycaret.internal.pycaret_experiment import TimeSeriesExperiment
from pycaret.time_series import TSForecastingExperiment
plotly.offline.init_notebook_mode()
In [2]:
data_path = os.path.join(os.path.expanduser('~'),'ILAB_DATA','ISTAT', 'DATA')
Caricamento dei dati¶
In [3]:
# 25min
comuni_all_pop = pd.read_parquet(os.path.join(data_path,'comuni_all_pop.parquet'))
comuni_all = gpd.read_parquet(os.path.join(data_path,'comuni_all.parquet'))
comuni_all_pop['TIME_PERIOD'] = comuni_all_pop['TIME_PERIOD'].astype(int)
comuni_all_pop = comuni_all_pop.merge(comuni_all[['pro_com_t','anno','comune']], left_on=['REF_AREA','TIME_PERIOD'], right_on=['pro_com_t','anno'])
In [42]:
# Ciampino 058118
# Roma 058091
gdf = comuni_all_pop.loc[comuni_all_pop['REF_AREA']=='058118',['TIME_PERIOD','total']].copy()
Conversione in datetime e ordinamento¶
In [43]:
gdf['data'] = pd.to_datetime(gdf['TIME_PERIOD'], format='%Y')
gdf = gdf.sort_values(by='data', ascending=True)
Visualizzazione dati¶
In [44]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=gdf['data'], y=gdf['total']))#, name='descCarburante'))
fig.show()
Impostazione dell'indice e riempimento gap con valori precendenti¶
In [45]:
y = gdf[['data','total']].set_index('data')
y.index = y.index.to_period("Y")
Auto Create¶
In [51]:
fh = 3
fold = 3
In [52]:
exp = TSForecastingExperiment()
exp.setup(data=y, fh=fh, fold=fold, session_id=42)
| Description | Value | |
|---|---|---|
| 0 | session_id | 42 |
| 1 | Target | total |
| 2 | Approach | Univariate |
| 3 | Exogenous Variables | Not Present |
| 4 | Original data shape | (23, 1) |
| 5 | Transformed data shape | (23, 1) |
| 6 | Transformed train set shape | (20, 1) |
| 7 | Transformed test set shape | (3, 1) |
| 8 | Rows with missing values | 0.0% |
| 9 | Fold Generator | ExpandingWindowSplitter |
| 10 | Fold Number | 3 |
| 11 | Enforce Prediction Interval | False |
| 12 | Splits used for hyperparameters | all |
| 13 | User Defined Seasonal Period(s) | None |
| 14 | Ignore Seasonality Test | False |
| 15 | Seasonality Detection Algo | auto |
| 16 | Max Period to Consider | 60 |
| 17 | Seasonal Period(s) Tested | [] |
| 18 | Significant Seasonal Period(s) | [1] |
| 19 | Significant Seasonal Period(s) without Harmonics | [1] |
| 20 | Remove Harmonics | False |
| 21 | Harmonics Order Method | harmonic_max |
| 22 | Num Seasonalities to Use | 1 |
| 23 | All Seasonalities to Use | [1] |
| 24 | Primary Seasonality | 1 |
| 25 | Seasonality Present | False |
| 26 | Seasonality Type | None |
| 27 | Target Strictly Positive | True |
| 28 | Target White Noise | No |
| 29 | Recommended d | 1 |
| 30 | Recommended Seasonal D | 0 |
| 31 | Preprocess | False |
| 32 | CPU Jobs | -1 |
| 33 | Use GPU | False |
| 34 | Log Experiment | False |
| 35 | Experiment Name | ts-default-name |
| 36 | USI | b5f8 |
Out[52]:
<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x7ff393afcfd0>
Compare Models¶
In [53]:
# 26min
best_baseline_models = exp.compare_models(fold=fold, sort='smape', n_select=3)
best_baseline_models
| Model | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| theta | Theta Forecaster | 1.1873 | 1.0991 | 223.1660 | 251.7246 | 0.0059 | 0.0059 | -7.6583 | 0.0400 |
| naive | Naive Forecaster | 1.2934 | 1.2285 | 243.1111 | 277.3586 | 0.0064 | 0.0064 | -18.6039 | 0.0267 |
| auto_arima | Auto ARIMA | 1.5301 | 1.3719 | 289.3607 | 315.9818 | 0.0076 | 0.0076 | -13.2939 | 0.2567 |
| et_cds_dt | Extra Trees w/ Cond. Deseasonalize & Detrending | 1.6978 | 1.6364 | 322.1227 | 376.8875 | 0.0085 | 0.0085 | -25.1448 | 0.1800 |
| exp_smooth | Exponential Smoothing | 1.7274 | 1.5685 | 327.2908 | 361.4960 | 0.0086 | 0.0086 | -20.2304 | 0.0267 |
| arima | ARIMA | 1.7389 | 1.5907 | 326.4621 | 358.3690 | 0.0086 | 0.0086 | -39.7350 | 0.0633 |
| omp_cds_dt | Orthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending | 1.7720 | 1.6289 | 333.7403 | 373.6666 | 0.0088 | 0.0088 | -18.6676 | 0.0567 |
| en_cds_dt | Elastic Net w/ Cond. Deseasonalize & Detrending | 1.7720 | 1.6290 | 333.7481 | 373.6736 | 0.0088 | 0.0088 | -18.6679 | 0.0600 |
| lr_cds_dt | Linear w/ Cond. Deseasonalize & Detrending | 1.7720 | 1.6289 | 333.7403 | 373.6666 | 0.0088 | 0.0088 | -18.6676 | 0.0733 |
| ridge_cds_dt | Ridge w/ Cond. Deseasonalize & Detrending | 1.7720 | 1.6289 | 333.7407 | 373.6669 | 0.0088 | 0.0088 | -18.6676 | 0.0633 |
| lasso_cds_dt | Lasso w/ Cond. Deseasonalize & Detrending | 1.7720 | 1.6290 | 333.7499 | 373.6754 | 0.0088 | 0.0088 | -18.6681 | 0.0667 |
| llar_cds_dt | Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending | 1.7720 | 1.6290 | 333.7499 | 373.6754 | 0.0088 | 0.0088 | -18.6681 | 0.0733 |
| huber_cds_dt | Huber w/ Cond. Deseasonalize & Detrending | 1.8360 | 1.6735 | 344.6931 | 383.1269 | 0.0091 | 0.0091 | -18.3441 | 0.0667 |
| ada_cds_dt | AdaBoost w/ Cond. Deseasonalize & Detrending | 1.8885 | 1.7399 | 358.9872 | 400.7277 | 0.0095 | 0.0094 | -31.1802 | 0.1167 |
| dt_cds_dt | Decision Tree w/ Cond. Deseasonalize & Detrending | 1.9023 | 1.7722 | 361.6294 | 408.2775 | 0.0095 | 0.0095 | -31.9375 | 0.0700 |
| br_cds_dt | Bayesian Ridge w/ Cond. Deseasonalize & Detrending | 1.9318 | 1.7452 | 362.7290 | 399.5703 | 0.0096 | 0.0095 | -20.1384 | 0.0767 |
| gbr_cds_dt | Gradient Boosting w/ Cond. Deseasonalize & Detrending | 1.9011 | 1.7708 | 361.3701 | 407.9285 | 0.0095 | 0.0095 | -31.8227 | 0.0833 |
| rf_cds_dt | Random Forest w/ Cond. Deseasonalize & Detrending | 1.9694 | 1.8483 | 370.3568 | 423.4216 | 0.0098 | 0.0097 | -23.7242 | 0.2333 |
| knn_cds_dt | K Neighbors w/ Cond. Deseasonalize & Detrending | 2.4358 | 2.1823 | 450.8498 | 495.0524 | 0.0119 | 0.0118 | -21.3223 | 0.1000 |
| lightgbm_cds_dt | Light Gradient Boosting w/ Cond. Deseasonalize & Detrending | 2.6184 | 2.2990 | 483.3795 | 520.1676 | 0.0128 | 0.0127 | -22.4420 | 2.0467 |
| polytrend | Polynomial Trend Forecaster | 2.6548 | 2.3122 | 490.0676 | 523.1799 | 0.0129 | 0.0128 | -21.9418 | 0.0367 |
| ets | ETS | 2.6558 | 2.3129 | 490.2546 | 523.3472 | 0.0129 | 0.0128 | -21.9581 | 0.0500 |
| croston | Croston | 5.3384 | 4.4858 | 1007.3380 | 1020.6685 | 0.0265 | 0.0269 | -286.6918 | 0.0400 |
| grand_means | Grand Means Forecaster | 5.4653 | 4.5778 | 1028.1514 | 1040.1844 | 0.0271 | 0.0275 | -264.8290 | 0.0233 |
Out[53]:
[ThetaForecaster(), NaiveForecaster(), AutoARIMA(random_state=42, suppress_warnings=True)]
In [54]:
compare_metrics = exp.pull()
compare_metrics
Out[54]:
| Model | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| theta | Theta Forecaster | 1.1873 | 1.0991 | 223.166 | 251.7246 | 0.0059 | 0.0059 | -7.6583 | 0.0400 |
| naive | Naive Forecaster | 1.2934 | 1.2285 | 243.1111 | 277.3586 | 0.0064 | 0.0064 | -18.6039 | 0.0267 |
| auto_arima | Auto ARIMA | 1.5301 | 1.3719 | 289.3607 | 315.9818 | 0.0076 | 0.0076 | -13.2939 | 0.2567 |
| et_cds_dt | Extra Trees w/ Cond. Deseasonalize & Detrending | 1.6978 | 1.6364 | 322.1227 | 376.8875 | 0.0085 | 0.0085 | -25.1448 | 0.1800 |
| exp_smooth | Exponential Smoothing | 1.7274 | 1.5685 | 327.2908 | 361.496 | 0.0086 | 0.0086 | -20.2304 | 0.0267 |
| arima | ARIMA | 1.7389 | 1.5907 | 326.4621 | 358.369 | 0.0086 | 0.0086 | -39.735 | 0.0633 |
| omp_cds_dt | Orthogonal Matching Pursuit w/ Cond. Deseasona... | 1.772 | 1.6289 | 333.7403 | 373.6666 | 0.0088 | 0.0088 | -18.6676 | 0.0567 |
| en_cds_dt | Elastic Net w/ Cond. Deseasonalize & Detrending | 1.772 | 1.629 | 333.7481 | 373.6736 | 0.0088 | 0.0088 | -18.6679 | 0.0600 |
| lr_cds_dt | Linear w/ Cond. Deseasonalize & Detrending | 1.772 | 1.6289 | 333.7403 | 373.6666 | 0.0088 | 0.0088 | -18.6676 | 0.0733 |
| ridge_cds_dt | Ridge w/ Cond. Deseasonalize & Detrending | 1.772 | 1.6289 | 333.7407 | 373.6669 | 0.0088 | 0.0088 | -18.6676 | 0.0633 |
| lasso_cds_dt | Lasso w/ Cond. Deseasonalize & Detrending | 1.772 | 1.629 | 333.7499 | 373.6754 | 0.0088 | 0.0088 | -18.6681 | 0.0667 |
| llar_cds_dt | Lasso Least Angular Regressor w/ Cond. Deseaso... | 1.772 | 1.629 | 333.7499 | 373.6754 | 0.0088 | 0.0088 | -18.6681 | 0.0733 |
| huber_cds_dt | Huber w/ Cond. Deseasonalize & Detrending | 1.836 | 1.6735 | 344.6931 | 383.1269 | 0.0091 | 0.0091 | -18.3441 | 0.0667 |
| ada_cds_dt | AdaBoost w/ Cond. Deseasonalize & Detrending | 1.8885 | 1.7399 | 358.9872 | 400.7277 | 0.0095 | 0.0094 | -31.1802 | 0.1167 |
| dt_cds_dt | Decision Tree w/ Cond. Deseasonalize & Detrending | 1.9023 | 1.7722 | 361.6294 | 408.2775 | 0.0095 | 0.0095 | -31.9375 | 0.0700 |
| br_cds_dt | Bayesian Ridge w/ Cond. Deseasonalize & Detren... | 1.9318 | 1.7452 | 362.729 | 399.5703 | 0.0096 | 0.0095 | -20.1384 | 0.0767 |
| gbr_cds_dt | Gradient Boosting w/ Cond. Deseasonalize & Det... | 1.9011 | 1.7708 | 361.3701 | 407.9285 | 0.0095 | 0.0095 | -31.8227 | 0.0833 |
| rf_cds_dt | Random Forest w/ Cond. Deseasonalize & Detrending | 1.9694 | 1.8483 | 370.3568 | 423.4216 | 0.0098 | 0.0097 | -23.7242 | 0.2333 |
| knn_cds_dt | K Neighbors w/ Cond. Deseasonalize & Detrending | 2.4358 | 2.1823 | 450.8498 | 495.0524 | 0.0119 | 0.0118 | -21.3223 | 0.1000 |
| lightgbm_cds_dt | Light Gradient Boosting w/ Cond. Deseasonalize... | 2.6184 | 2.299 | 483.3795 | 520.1676 | 0.0128 | 0.0127 | -22.442 | 2.0467 |
| polytrend | Polynomial Trend Forecaster | 2.6548 | 2.3122 | 490.0676 | 523.1799 | 0.0129 | 0.0128 | -21.9418 | 0.0367 |
| ets | ETS | 2.6558 | 2.3129 | 490.2546 | 523.3472 | 0.0129 | 0.0128 | -21.9581 | 0.0500 |
| croston | Croston | 5.3384 | 4.4858 | 1007.338 | 1020.6685 | 0.0265 | 0.0269 | -286.6918 | 0.0400 |
| grand_means | Grand Means Forecaster | 5.4653 | 4.5778 | 1028.1514 | 1040.1844 | 0.0271 | 0.0275 | -264.829 | 0.0233 |
Tune Best Models¶
In [55]:
best_tuned_models = [exp.tune_model(model) for model in best_baseline_models]
best_tuned_models
| cutoff | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 0.7649 | 0.7851 | 149.0836 | 180.1416 | 0.0039 | 0.0039 | -11.6861 |
| 1 | 2014 | 1.9450 | 1.7803 | 370.7380 | 415.6881 | 0.0098 | 0.0097 | -11.7454 |
| 2 | 2017 | 0.8519 | 0.7318 | 149.6764 | 159.3441 | 0.0039 | 0.0039 | 0.4566 |
| Mean | NaT | 1.1873 | 1.0991 | 223.1660 | 251.7246 | 0.0059 | 0.0059 | -7.6583 |
| SD | NaT | 0.5369 | 0.4822 | 104.3494 | 116.2502 | 0.0028 | 0.0027 | 5.7381 |
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers. [Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.1s finished
| cutoff | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 0.8791 | 0.8082 | 171.3333 | 185.4476 | 0.0045 | 0.0045 | -12.4444 |
| 1 | 2014 | 2.0554 | 1.9520 | 391.7949 | 455.7935 | 0.0104 | 0.0103 | -14.3234 |
| 2 | 2017 | 0.7134 | 0.6198 | 125.3333 | 134.9511 | 0.0033 | 0.0033 | 0.6102 |
| Mean | NaT | 1.2160 | 1.1267 | 229.4872 | 258.7308 | 0.0061 | 0.0060 | -8.7192 |
| SD | NaT | 0.5974 | 0.5887 | 116.2951 | 140.8611 | 0.0031 | 0.0031 | 6.6413 |
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers. [Parallel(n_jobs=-1)]: Done 4 out of 6 | elapsed: 0.1s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 6 out of 6 | elapsed: 0.1s finished
| cutoff | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 0.8791 | 0.8082 | 171.3333 | 185.4476 | 0.0045 | 0.0045 | -12.4444 |
| 1 | 2014 | 2.9977 | 2.6876 | 571.4154 | 627.5467 | 0.0151 | 0.0150 | -28.0476 |
| 2 | 2017 | 0.7134 | 0.6198 | 125.3333 | 134.9511 | 0.0033 | 0.0033 | 0.6102 |
| Mean | NaT | 1.5301 | 1.3719 | 289.3607 | 315.9818 | 0.0076 | 0.0076 | -13.2939 |
| SD | NaT | 1.0400 | 0.9335 | 200.3250 | 221.2721 | 0.0053 | 0.0052 | 11.7149 |
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers. [Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.6s finished
Out[55]:
[ThetaForecaster(), NaiveForecaster(strategy='drift'), AutoARIMA(random_state=42, suppress_warnings=True)]
Voting Blender¶
In [56]:
top_model_metrics = compare_metrics.iloc[0:3]['SMAPE']
display(top_model_metrics)
top_model_weights = 1 - top_model_metrics/top_model_metrics.sum()
display(top_model_weights)
theta 0.0059 naive 0.0064 auto_arima 0.0076 Name: SMAPE, dtype: object
theta 0.703518 naive 0.678392 auto_arima 0.61809 Name: SMAPE, dtype: object
In [57]:
voting_blender = exp.blend_models(best_tuned_models, method='mean', weights=top_model_weights.values.tolist())
| cutoff | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 0.8389 | 0.7712 | 163.5068 | 176.9504 | 0.0043 | 0.0043 | -11.2406 |
| 1 | 2014 | 2.3078 | 2.1166 | 439.8988 | 494.2243 | 0.0116 | 0.0116 | -17.0164 |
| 2 | 2017 | 0.7621 | 0.6586 | 133.8962 | 143.3981 | 0.0035 | 0.0035 | 0.5599 |
| Mean | NaT | 1.3029 | 1.1821 | 245.7673 | 271.5243 | 0.0065 | 0.0065 | -9.2324 |
| SD | NaT | 0.7112 | 0.6624 | 137.8030 | 158.0673 | 0.0037 | 0.0036 | 7.3146 |
In [58]:
y_predict = exp.predict_model(voting_blender)
print(y_predict)
exp.plot_model(estimator=voting_blender)
| Model | MASE | RMSSE | MAE | RMSE | MAPE | SMAPE | R2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | EnsembleForecaster | 1.0001 | 0.9278 | 175.4905 | 203.5851 | 0.0045 | 0.0046 | -4.5873 |
y_pred 2021 38366.8370 2022 38462.5095 2023 38558.1821
Finalize Model¶
In [59]:
final_model = exp.finalize_model(voting_blender)
print(exp.predict_model(final_model))
exp.plot_model(final_model, data_kwargs={'fh':10})
y_pred 2024 38840.6328 2025 38939.9321 2026 39039.2314
In [ ]:
In [ ]:
In [ ]:
Save Model¶
In [17]:
_ = exp.save_model(final_model, "my_blender")
Transformation Pipeline and Model Successfully Saved
Load Model¶
In [ ]:
loaded_exp = TSForecastingExperiment()
m = loaded_exp.load_model("my_blender")
# Predictions should be same as before the model was saved and loaded
loaded_exp.predict_model(m)
In [ ]: