
In [1]:
import pandas as pd
import os
import xarray as xr
import hashlib
import plotly.graph_objects as go

from pycaret.time_series import TSForecastingExperiment
from datetime import datetime, timedelta
from tqdm import tqdm
from glob import glob
from math import dist

Preprocessing - From raster to CSV¶

Function that generates a unique id for a grid cell from the grid spacing and the coordinates of the cell centre.

In [2]:
def get_id_cell(row, distance_in):
    # Encode the grid spacing (e.g. 0.1 -> '0_1') so that cells coming from grids
    # with a different resolution never share an id.
    grid_distance = str(distance_in)[:3].replace('.', '_')
    # Combine spacing and cell-centre coordinates, then hash them into a compact id.
    id_cell = f'{grid_distance}__{row["latitude"]}_{row["longitude"]}'
    result = hashlib.md5(id_cell.encode()).hexdigest()
    return result
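
The id is deterministic: the same spacing and cell centre always hash to the same value. A minimal usage sketch (the coordinates and spacing below are invented for illustration):

In [ ]:
# Hypothetical cell centre and grid spacing, only to show how the id is built.
sample_row = pd.Series({'latitude': 39.25, 'longitude': 9.15})
print(get_id_cell(sample_row, 0.1))  # hashes the string '0_1__39.25_9.15'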
In [3]:
cams_nc_path = os.path.join(os.path.expanduser('~'), 'ILAB_DATA', 'COPERNICUS','NC')
cams_data_path = os.path.join(os.path.expanduser('~'), 'ILAB_DATA', 'COPERNICUS','DATA')
In [5]:
file_list = glob(os.path.join(cams_nc_path,'*.nc'))
file_list.sort()
tot = len(file_list)
output = list()
for i, file in enumerate(file_list, start=1):
    head, tail = os.path.split(file)
    # File names are expected to look like '<prefix>_<location>_<YYYYMM>.nc'.
    fname = tail.replace('.nc', '').split('_')
    loc = fname[1]
    if i == 1:
        start_m = fname[2]  # first month in the archive
    elif i == tot:
        end_m = fname[2]  # last month in the archive
    output.append((head, tail))
print(start_m,end_m)
202105 202405

A cell covering part of the municipality of Cagliari (city centre) is selected.

In [6]:
cell_sel = '54cfb985fad9555c04e4b4135a471259'
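
The hash of any other cell can be recovered from one of the nc files by picking the grid point closest to a target location. A minimal sketch, with approximate coordinates for central Cagliari and a hard-coded grid spacing (both are assumptions used only for illustration):

In [ ]:
# Illustrative lookup of the id_cell nearest to a target point.
# target_lat/target_lon are rough coordinates for central Cagliari; 0.1 is an assumed grid spacing.
target_lat, target_lon = 39.22, 9.12
ds = xr.open_dataset(file_list[0])
df0 = ds.to_dataframe().reset_index()
ds.close()
cells = df0[['latitude', 'longitude']].drop_duplicates().copy()
cells['d2'] = (cells['latitude'] - target_lat)**2 + (cells['longitude'] - target_lon)**2
nearest = cells.sort_values('d2').iloc[0]
print(get_id_cell(nearest, 0.1))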

Read the nc files, enrich them with additional information (absolute time, cell id, ...), format the data (datetime) and filter on the selected cell.

In [7]:
out = pd.DataFrame()
for file in tqdm(output, total=len(output)):
    cams_data_fname = os.path.join(file[0], file[1])
    ds = xr.open_dataset(cams_data_fname)
    df = ds.to_dataframe().reset_index()
    ds.close()
    # Drop the '_conc' suffix so pollutant columns share the same name across files.
    df.columns = [col.replace('_conc', '') for col in df.columns]
    variables = list(df.columns)
    variables = [element for element in variables if element not in ['longitude', 'latitude', 'level', 'time']]
    aggs_dict = {element: 'mean' for element in variables}  # built here but not used below
    # The file name encodes the month (YYYYMM); 'time' in the dataset is an offset from the start of that month.
    timestart = datetime.strptime(file[1].split('_')[2].split('.')[0], '%Y%m')
    timeend = (timestart + timedelta(days=31)).replace(day=1)  # first day of the following month
    df['time_abs'] = timestart + df['time']
    df = df[df['time_abs'] < timeend].copy()
    # Grid spacing = longitude difference between two adjacent cells at the first time step.
    lons = df[df['time_abs'] == timestart].sort_values(by=['latitude', 'longitude']).head(2)['longitude'].values
    lats = df[df['time_abs'] == timestart].sort_values(by=['latitude', 'longitude']).head(2)['latitude'].values
    distance = dist([lons[1]], [lons[0]])
    df['id_cell'] = df.apply(get_id_cell, distance_in=distance, axis='columns')
    df.drop(columns=['longitude', 'latitude', 'level', 'time'], inplace=True)
    # Drop these extra columns when present, to keep a consistent schema across files.
    d_columns = ['pm2p5_total_om', 'ecres', 'ectot']
    for d_column in d_columns:
        if d_column in df.columns:
            del df[d_column]
    # Keep only the selected cell and append it to the overall series.
    df_cell = df[df['id_cell'] == cell_sel].copy()
    out = pd.concat([out, df_cell])
out.sort_values(by='time_abs',inplace=True)
100%|██████████| 37/37 [03:26<00:00,  5.58s/it]
In [8]:
print(out['time_abs'].min(), out['time_abs'].max())
2021-05-01 00:00:00 2024-05-31 23:00:00
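
This section is titled "From raster to CSV", and cams_data_path was prepared above for the tabular output; a minimal sketch for persisting the extracted cell series (the file name is an assumption, not part of the original pipeline):

In [ ]:
# Sketch: save the extracted hourly series as CSV under cams_data_path.
# The file name below is illustrative.
os.makedirs(cams_data_path, exist_ok=True)
out.to_csv(os.path.join(cams_data_path, f'{cell_sel}_{start_m}_{end_m}.csv'), index=False)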

Time-trend visualisations of the pm2.5 variable¶

Hourly trend over the full period (May 2021 – May 2024)¶

In [9]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=out['time_abs'],y=out['pm2p5'],name='pm2p5'))
fig.show()
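
Hourly trend for a single day (2021-05-01)¶
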
In [10]:
day = '2021-05-01'
day_start = datetime.strptime(day,'%Y-%m-%d').date()
df_giorno = out[out['time_abs'].dt.date == day_start]

fig = go.Figure()
fig.add_trace(go.Scatter(x=df_giorno['time_abs'],y=df_giorno['pm2p5'],name='pm2p5'))
fig.show()

Hourly trend for May 2021¶

In [11]:
day_end = day_start + timedelta(days=31)
df_week = out[(out['time_abs'].dt.date >= day_start) & (out['time_abs'].dt.date < day_end)]

fig = go.Figure()
fig.add_trace(go.Scatter(x=df_week['time_abs'],y=df_week['pm2p5'],name='pm2p5'))
fig.show()
In [12]:
out.set_index('time_abs', inplace=True)
In [16]:
y = out['pm2p5']
y = y.resample('1D').mean()
#y = y.resample('1H').mean().ffill()
In [ ]:
y = y.loc['2024-01-01':].copy()
In [100]:
#y = y.loc['2024-04-01':'2024-06-01'].copy()
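
Whatever resampling and date window is chosen, it is worth confirming that the series handed to the experiment has a regular index and no missing values; a small check (illustrative):

In [ ]:
# Quick sanity check on the series passed to PyCaret.
print(y.index.min(), y.index.max(), len(y))
print('missing values:', int(y.isna().sum()))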

Auto Create¶

In [122]:
fh = 24
fold = 3
In [123]:
exp = TSForecastingExperiment()
exp.setup(data=y, fh=fh, fold=fold, session_id=42)
  Description Value
0 session_id 42
1 Target pm2p5
2 Approach Univariate
3 Exogenous Variables Not Present
4 Original data shape (1488, 1)
5 Transformed data shape (1488, 1)
6 Transformed train set shape (1464, 1)
7 Transformed test set shape (24, 1)
8 Rows with missing values 0.0%
9 Fold Generator ExpandingWindowSplitter
10 Fold Number 3
11 Enforce Prediction Interval False
12 Splits used for hyperparameters all
13 User Defined Seasonal Period(s) None
14 Ignore Seasonality Test False
15 Seasonality Detection Algo auto
16 Max Period to Consider 60
17 Seasonal Period(s) Tested [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
18 Significant Seasonal Period(s) [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
19 Significant Seasonal Period(s) without Harmonics [32, 30, 28, 27, 22, 24, 26, 17, 18, 19, 20, 21, 23, 25, 29, 31]
20 Remove Harmonics False
21 Harmonics Order Method harmonic_max
22 Num Seasonalities to Use 1
23 All Seasonalities to Use [2]
24 Primary Seasonality 2
25 Seasonality Present True
26 Seasonality Type mul
27 Target Strictly Positive True
28 Target White Noise No
29 Recommended d 0
30 Recommended Seasonal D 0
31 Preprocess False
32 CPU Jobs -1
33 Use GPU False
34 Log Experiment False
35 Experiment Name ts-default-name
36 USI 731b
Out[123]:
<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x1819354dcd0>

Compare Models¶

In [124]:
# 26min
best_baseline_models = exp.compare_models(fold=fold, sort='smape', n_select=3)
best_baseline_models
  Model MASE RMSSE MAE RMSE MAPE SMAPE R2 TT (Sec)
stlf STLF 0.8093 0.4719 0.7678 0.8961 0.1840 0.1695 -0.9035 0.0200
snaive Seasonal Naive Forecaster 0.9340 0.5360 0.8850 1.0165 0.2229 0.1885 -1.5699 0.0233
naive Naive Forecaster 0.9793 0.5503 0.9276 1.0433 0.2338 0.1954 -1.8426 0.0167
theta Theta Forecaster 0.9844 0.5531 0.9325 1.0487 0.2351 0.1962 -1.8755 0.0167
gbr_cds_dt Gradient Boosting w/ Cond. Deseasonalize & Detrending 1.1008 0.6110 1.0425 1.1584 0.2639 0.2143 -2.6040 0.1667
arima ARIMA 1.1367 0.6599 1.0777 1.2524 0.2592 0.2348 -2.7828 0.0600
croston Croston 1.2829 0.7003 1.2218 1.3343 0.2852 0.2377 -4.7866 0.0133
auto_arima Auto ARIMA 1.2380 0.6852 1.1725 1.2990 0.2887 0.2419 -3.3225 1.5467
dt_cds_dt Decision Tree w/ Cond. Deseasonalize & Detrending 1.5683 0.9662 1.4935 1.8418 0.3471 0.2700 -9.9350 0.1233
lightgbm_cds_dt Light Gradient Boosting w/ Cond. Deseasonalize & Detrending 1.4650 0.8021 1.3912 1.5243 0.3379 0.2711 -5.2953 0.2600
ada_cds_dt AdaBoost w/ Cond. Deseasonalize & Detrending 1.4823 0.8163 1.4069 1.5507 0.3460 0.2791 -4.7207 0.1800
et_cds_dt Extra Trees w/ Cond. Deseasonalize & Detrending 1.7655 0.9864 1.6765 1.8747 0.4072 0.3156 -7.8484 0.3367
omp_cds_dt Orthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending 1.9326 1.0719 1.8340 2.0359 0.4493 0.3408 -8.9550 0.1233
huber_cds_dt Huber w/ Cond. Deseasonalize & Detrending 1.9789 1.0959 1.8776 2.0812 0.4614 0.3445 -9.6701 0.1233
rf_cds_dt Random Forest w/ Cond. Deseasonalize & Detrending 2.1603 1.1792 2.0471 2.2364 0.5155 0.3538 -13.4532 0.3300
knn_cds_dt K Neighbors w/ Cond. Deseasonalize & Detrending 2.6284 1.4169 2.4957 2.6931 0.5957 0.4332 -16.6464 0.2600
en_cds_dt Elastic Net w/ Cond. Deseasonalize & Detrending 2.6619 1.4592 2.5272 2.7729 0.6085 0.4357 -17.4135 0.1233
lasso_cds_dt Lasso w/ Cond. Deseasonalize & Detrending 2.6824 1.4698 2.5467 2.7931 0.6129 0.4383 -17.6850 0.1333
llar_cds_dt Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending 2.6824 1.4698 2.5467 2.7931 0.6129 0.4383 -17.6850 0.1400
ridge_cds_dt Ridge w/ Cond. Deseasonalize & Detrending 2.7475 1.5144 2.6083 2.8777 0.6285 0.4428 -18.9700 0.1200
br_cds_dt Bayesian Ridge w/ Cond. Deseasonalize & Detrending 2.7479 1.5146 2.6087 2.8781 0.6286 0.4429 -18.9762 0.1667
lr_cds_dt Linear w/ Cond. Deseasonalize & Detrending 2.7493 1.5154 2.6101 2.8796 0.6289 0.4430 -18.9966 0.1267
exp_smooth Exponential Smoothing 2.8331 1.6164 2.6887 3.0704 0.6425 0.4984 -21.8446 0.0767
grand_means Grand Means Forecaster 3.8012 1.9295 3.6098 3.6671 0.8446 0.5767 -31.4000 0.0167
polytrend Polynomial Trend Forecaster 5.0333 2.5381 4.7816 4.8255 1.1058 0.6966 -56.0615 0.0100
ets ETS 3.1535 1.8227 3.0016 3.4726 0.6624 0.7008 -37.1808 0.1033
Out[124]:
[STLForecaster(), NaiveForecaster(sp=2), NaiveForecaster()]
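
The ranking above is sorted by SMAPE (the metric passed to compare_models). For reference, the sketch below is a plain re-implementation of the usual symmetric-MAPE definition; PyCaret computes the column internally (via sktime metrics), so this helper is only illustrative and may differ in edge-case handling:

In [ ]:
import numpy as np

def smape(y_true, y_pred):
    """Symmetric MAPE: mean of 2*|y - yhat| / (|y| + |yhat|)."""
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(2.0 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred)))

smape([4.0, 5.0, 6.0], [4.4, 4.5, 6.6])  # ~0.0986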
In [125]:
compare_metrics = exp.pull()
compare_metrics
Out[125]:
Model MASE RMSSE MAE RMSE MAPE SMAPE R2 TT (Sec)
stlf STLF 0.8093 0.4719 0.7678 0.8961 0.184 0.1695 -0.9035 0.0200
snaive Seasonal Naive Forecaster 0.934 0.536 0.885 1.0165 0.2229 0.1885 -1.5699 0.0233
naive Naive Forecaster 0.9793 0.5503 0.9276 1.0433 0.2338 0.1954 -1.8426 0.0167
theta Theta Forecaster 0.9844 0.5531 0.9325 1.0487 0.2351 0.1962 -1.8755 0.0167
gbr_cds_dt Gradient Boosting w/ Cond. Deseasonalize & Det... 1.1008 0.611 1.0425 1.1584 0.2639 0.2143 -2.604 0.1667
arima ARIMA 1.1367 0.6599 1.0777 1.2524 0.2592 0.2348 -2.7828 0.0600
croston Croston 1.2829 0.7003 1.2218 1.3343 0.2852 0.2377 -4.7866 0.0133
auto_arima Auto ARIMA 1.238 0.6852 1.1725 1.299 0.2887 0.2419 -3.3225 1.5467
dt_cds_dt Decision Tree w/ Cond. Deseasonalize & Detrending 1.5683 0.9662 1.4935 1.8418 0.3471 0.27 -9.935 0.1233
lightgbm_cds_dt Light Gradient Boosting w/ Cond. Deseasonalize... 1.465 0.8021 1.3912 1.5243 0.3379 0.2711 -5.2953 0.2600
ada_cds_dt AdaBoost w/ Cond. Deseasonalize & Detrending 1.4823 0.8163 1.4069 1.5507 0.346 0.2791 -4.7207 0.1800
et_cds_dt Extra Trees w/ Cond. Deseasonalize & Detrending 1.7655 0.9864 1.6765 1.8747 0.4072 0.3156 -7.8484 0.3367
omp_cds_dt Orthogonal Matching Pursuit w/ Cond. Deseasona... 1.9326 1.0719 1.834 2.0359 0.4493 0.3408 -8.955 0.1233
huber_cds_dt Huber w/ Cond. Deseasonalize & Detrending 1.9789 1.0959 1.8776 2.0812 0.4614 0.3445 -9.6701 0.1233
rf_cds_dt Random Forest w/ Cond. Deseasonalize & Detrending 2.1603 1.1792 2.0471 2.2364 0.5155 0.3538 -13.4532 0.3300
knn_cds_dt K Neighbors w/ Cond. Deseasonalize & Detrending 2.6284 1.4169 2.4957 2.6931 0.5957 0.4332 -16.6464 0.2600
en_cds_dt Elastic Net w/ Cond. Deseasonalize & Detrending 2.6619 1.4592 2.5272 2.7729 0.6085 0.4357 -17.4135 0.1233
lasso_cds_dt Lasso w/ Cond. Deseasonalize & Detrending 2.6824 1.4698 2.5467 2.7931 0.6129 0.4383 -17.685 0.1333
llar_cds_dt Lasso Least Angular Regressor w/ Cond. Deseaso... 2.6824 1.4698 2.5467 2.7931 0.6129 0.4383 -17.685 0.1400
ridge_cds_dt Ridge w/ Cond. Deseasonalize & Detrending 2.7475 1.5144 2.6083 2.8777 0.6285 0.4428 -18.97 0.1200
br_cds_dt Bayesian Ridge w/ Cond. Deseasonalize & Detren... 2.7479 1.5146 2.6087 2.8781 0.6286 0.4429 -18.9762 0.1667
lr_cds_dt Linear w/ Cond. Deseasonalize & Detrending 2.7493 1.5154 2.6101 2.8796 0.6289 0.443 -18.9966 0.1267
exp_smooth Exponential Smoothing 2.8331 1.6164 2.6887 3.0704 0.6425 0.4984 -21.8446 0.0767
grand_means Grand Means Forecaster 3.8012 1.9295 3.6098 3.6671 0.8446 0.5767 -31.4 0.0167
polytrend Polynomial Trend Forecaster 5.0333 2.5381 4.7816 4.8255 1.1058 0.6966 -56.0615 0.0100
ets ETS 3.1535 1.8227 3.0016 3.4726 0.6624 0.7008 -37.1808 0.1033

Tune Best Models¶

In [126]:
best_tuned_models = [exp.tune_model(model) for model in best_baseline_models]
best_tuned_models
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2024-05-28 23:00 0.6138 0.3531 0.5879 0.6767 0.1146 0.1212 -0.3633
1 2024-05-29 23:00 0.6450 0.3894 0.6125 0.7404 0.1327 0.1329 -0.4789
2 2024-05-30 23:00 1.1350 0.6519 1.0706 1.2304 0.2990 0.2477 -1.7015
Mean NaT 0.7979 0.4648 0.7570 0.8825 0.1821 0.1673 -0.8479
SD NaT 0.2387 0.1331 0.2220 0.2474 0.0830 0.0571 0.6054
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2024-05-28 23:00 0.5532 0.3206 0.5299 0.6143 0.1113 0.1066 -0.1237
1 2024-05-29 23:00 0.6371 0.3943 0.6050 0.7498 0.1383 0.1289 -0.5165
2 2024-05-30 23:00 1.6116 0.8930 1.5202 1.6855 0.4190 0.3300 -4.0696
Mean NaT 0.9340 0.5360 0.8850 1.0165 0.2229 0.1885 -1.5699
SD NaT 0.4804 0.2542 0.4502 0.4762 0.1391 0.1005 1.7748
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.0s finished
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2024-05-28 23:00 0.4720 0.2820 0.4521 0.5404 0.0910 0.0918 0.1306
1 2024-05-29 23:00 0.7596 0.4700 0.7212 0.8937 0.1465 0.1599 -1.1542
2 2024-05-30 23:00 1.4148 0.7847 1.3345 1.4812 0.3685 0.2974 -2.9150
Mean NaT 0.8821 0.5122 0.8360 0.9717 0.2020 0.1830 -1.3128
SD NaT 0.3945 0.2074 0.3693 0.3880 0.1199 0.0855 1.2484
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s finished
Out[126]:
[STLForecaster(low_pass_deg=0, seasonal_deg=0),
 NaiveForecaster(sp=2),
 NaiveForecaster(strategy='drift')]
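
By default tune_model runs a cross-validated search over each estimator's hyperparameters (the "Fitting 3 folds for each of N candidates" logs above). The call can also be steered explicitly; a sketch with the search budget and target metric spelled out (the values are illustrative):

In [ ]:
# Illustrative: same tuning step with the budget and optimisation metric made explicit.
tuned_stlf = exp.tune_model(best_baseline_models[0], n_iter=10, optimize='SMAPE')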

Voting Blender¶

In [127]:
top_model_metrics = compare_metrics.iloc[0:3]['SMAPE']
display(top_model_metrics)

top_model_weights = 1 - top_model_metrics/top_model_metrics.sum()
display(top_model_weights)
stlf      0.1695
snaive    0.1885
naive     0.1954
Name: SMAPE, dtype: object
stlf      0.693712
snaive    0.659378
naive      0.64691
Name: SMAPE, dtype: object
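
Each weight grows as a model's SMAPE shrinks: it is 1 minus that model's share of the summed SMAPE, e.g. for stlf 1 - 0.1695/0.5534 ≈ 0.6937. For a weighted mean only the relative sizes matter, so the weights are not forced to sum to 1; the normalised shares can be inspected with a one-liner:

In [ ]:
# Normalised view of the blending weights (purely for inspection).
print(top_model_weights / top_model_weights.sum())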
In [128]:
voting_blender = exp.blend_models(best_tuned_models, method='mean', weights=top_model_weights.values.tolist())
  cutoff MASE RMSSE MAE RMSE MAPE SMAPE R2
0 2024-05-28 23:00 0.4815 0.2874 0.4612 0.5507 0.0923 0.0932 0.0971
1 2024-05-29 23:00 0.6151 0.3732 0.5841 0.7097 0.1259 0.1268 -0.3584
2 2024-05-30 23:00 1.3749 0.7706 1.2969 1.4545 0.3597 0.2906 -2.7754
Mean NaT 0.8238 0.4771 0.7807 0.9050 0.1926 0.1702 -1.0123
SD NaT 0.3934 0.2105 0.3684 0.3940 0.1189 0.0863 1.2605
In [129]:
y_predict = exp.predict_model(voting_blender)
display(y_predict)
exp.plot_model(estimator=voting_blender)
  Model MASE RMSSE MAE RMSE MAPE SMAPE R2
0 EnsembleForecaster 0.6098 0.3435 0.5694 0.6432 0.1701 0.1530 -1.6802
y_pred
2024-06-01 00:00 4.2961
2024-06-01 01:00 4.3711
2024-06-01 02:00 4.2527
2024-06-01 03:00 4.3277
2024-06-01 04:00 4.2093
2024-06-01 05:00 4.2843
2024-06-01 06:00 4.1659
2024-06-01 07:00 4.2409
2024-06-01 08:00 4.1225
2024-06-01 09:00 4.1975
2024-06-01 10:00 4.0791
2024-06-01 11:00 4.1541
2024-06-01 12:00 4.0357
2024-06-01 13:00 4.1108
2024-06-01 14:00 3.9923
2024-06-01 15:00 4.0674
2024-06-01 16:00 3.9489
2024-06-01 17:00 4.0240
2024-06-01 18:00 3.9055
2024-06-01 19:00 3.9806
2024-06-01 20:00 3.8621
2024-06-01 21:00 3.9372
2024-06-01 22:00 3.8187
2024-06-01 23:00 3.8938

Finalize Model¶

In [130]:
final_model = exp.finalize_model(voting_blender)
y_pred = exp.predict_model(final_model, return_pred_int=True)
exp.plot_model(final_model, data_kwargs={'fh':10})
In [131]:
exp.plot_model(final_model, data_kwargs={'fh':40})
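
The finalized model, trained on the full series, can also be queried directly for a longer horizon together with prediction intervals; the horizon below is just an example:

In [ ]:
# Illustrative: forecast 48 steps ahead with the finalized blender, with prediction intervals.
y_future = exp.predict_model(final_model, fh=48, return_pred_int=True)
display(y_future.head())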

Save Model¶

In [ ]:
_ = exp.save_model(final_model, "my_blender")

Load Model¶

In [ ]:
loaded_exp = TSForecastingExperiment()
m = loaded_exp.load_model("my_blender")
# Predictions should be the same as before the model was saved and loaded
loaded_exp.predict_model(m)